From 9efd0e128ef464d45379e84abf15e98ad1b24d40 Mon Sep 17 00:00:00 2001 From: Tomasz Stefaniak Date: Mon, 2 Jun 2025 13:51:36 -0700 Subject: [PATCH] fix: formatting --- .continue/rules/documentation-standards.md | 2 +- core/llm/countTokens.ts | 2 +- .../rules/getSystemMessageWithRules.test.ts | 70 +- core/vendor/modules/.package-lock.json | 6 +- .../modules/@xenova/transformers/README.md | 186 +- .../@xenova/transformers/src/backends/onnx.js | 43 +- .../@xenova/transformers/src/configs.js | 109 +- .../modules/@xenova/transformers/src/env.js | 85 +- .../@xenova/transformers/src/models.js | 7197 +++++++++-------- .../@xenova/transformers/src/pipelines.js | 3631 +++++---- .../@xenova/transformers/src/processors.js | 3381 ++++---- .../@xenova/transformers/src/tokenizers.js | 7035 ++++++++-------- .../@xenova/transformers/src/transformers.js | 8 +- .../@xenova/transformers/src/utils/audio.js | 914 ++- .../@xenova/transformers/src/utils/core.js | 100 +- .../transformers/src/utils/data-structures.js | 687 +- .../transformers/src/utils/generation.js | 1300 +-- .../@xenova/transformers/src/utils/hub.js | 1030 +-- .../@xenova/transformers/src/utils/image.js | 1323 +-- .../@xenova/transformers/src/utils/maths.js | 1502 ++-- .../@xenova/transformers/src/utils/tensor.js | 1804 +++-- .../transformers/types/backends/onnx.d.ts | 4 +- .../@xenova/transformers/types/configs.d.ts | 76 +- .../@xenova/transformers/types/env.d.ts | 36 +- .../@xenova/transformers/types/models.d.ts | 3584 ++++---- .../@xenova/transformers/types/pipelines.d.ts | 1938 +++-- .../transformers/types/processors.d.ts | 1266 +-- .../transformers/types/tokenizers.d.ts | 1559 ++-- .../transformers/types/transformers.d.ts | 2 +- .../transformers/types/utils/audio.d.ts | 61 +- .../transformers/types/utils/core.d.ts | 6 +- .../types/utils/data-structures.d.ts | 380 +- .../transformers/types/utils/generation.d.ts | 875 +- .../@xenova/transformers/types/utils/hub.d.ts | 214 +- .../transformers/types/utils/image.d.ts | 
232 +- .../transformers/types/utils/maths.d.ts | 374 +- .../transformers/types/utils/tensor.d.ts | 467 +- docs/docs/customize/changelog.md | 2 +- extensions/vscode/e2e/tests/TODO.md | 2 +- .../vscode/models/all-MiniLM-L6-v2/README.md | 16 +- .../models/all-MiniLM-L6-v2/tokenizer.json | 18 +- .../vscode/src/extension/VsCodeMessenger.ts | 1 - .../src/quickEdit/EditDecorationManager.ts | 16 +- .../utils/remarkTables.tsx | 2 +- .../mainInput/belowMainInput/RulesPeek.tsx | 2 +- packages/continue-sdk/python/api/README.md | 33 +- .../python/api/docs/DefaultApi.md | 37 +- .../docs/ListAssistants200ResponseInner.md | 22 +- ...tAssistants200ResponseInnerConfigResult.md | 14 +- .../api/docs/ListAssistants401Response.md | 10 +- .../api/docs/ListAssistants404Response.md | 10 +- .../continue-sdk/typescript/api/README.md | 18 +- .../typescript/api/src/apis/DefaultApi.ts | 167 +- .../typescript/api/src/apis/index.ts | 2 +- .../continue-sdk/typescript/api/src/index.ts | 6 +- .../models/ListAssistants200ResponseInner.ts | 197 +- ...tAssistants200ResponseInnerConfigResult.ts | 115 +- .../src/models/ListAssistants401Response.ts | 75 +- .../src/models/ListAssistants404Response.ts | 75 +- .../typescript/api/src/models/index.ts | 8 +- .../typescript/api/src/runtime.ts | 686 +- .../continue-sdk/typescript/api/tsconfig.json | 9 +- 62 files changed, 22524 insertions(+), 20508 deletions(-) diff --git a/.continue/rules/documentation-standards.md b/.continue/rules/documentation-standards.md index ff7235857..a510b7ca7 100644 --- a/.continue/rules/documentation-standards.md +++ b/.continue/rules/documentation-standards.md @@ -13,4 +13,4 @@ description: Standards for writing and maintaining Continue Docs - Include cross-references to related documentation - Reference other docs with relative paths - Keep paragraphs concise and scannable -- Use code blocks with appropriate language tags \ No newline at end of file +- Use code blocks with appropriate language tags diff --git 
a/core/llm/countTokens.ts b/core/llm/countTokens.ts index 1571b2485..9ca8d1fe8 100644 --- a/core/llm/countTokens.ts +++ b/core/llm/countTokens.ts @@ -480,5 +480,5 @@ export { pruneLinesFromTop, pruneRawPromptFromTop, pruneStringFromBottom, - pruneStringFromTop + pruneStringFromTop, }; diff --git a/core/llm/rules/getSystemMessageWithRules.test.ts b/core/llm/rules/getSystemMessageWithRules.test.ts index 39f18e976..d9d12eaec 100644 --- a/core/llm/rules/getSystemMessageWithRules.test.ts +++ b/core/llm/rules/getSystemMessageWithRules.test.ts @@ -1,6 +1,9 @@ /* eslint-disable max-lines-per-function */ import { ContextItemWithId, RuleWithSource, UserChatMessage } from "../.."; -import { getSystemMessageWithRules, shouldApplyRule } from "./getSystemMessageWithRules"; +import { + getSystemMessageWithRules, + shouldApplyRule, +} from "./getSystemMessageWithRules"; describe("getSystemMessageWithRules", () => { const baseSystemMessage = "Base system message"; @@ -450,7 +453,8 @@ describe("getSystemMessageWithRules", () => { it("should include rules with alwaysApply: false when globs match", () => { const userMessage: UserChatMessage = { role: "user", - content: "```tsx Component.tsx\nexport const Component = () =>
Hello
;\n```", + content: + "```tsx Component.tsx\nexport const Component = () =>
Hello
;\n```", }; const result = getSystemMessageWithRules({ @@ -701,14 +705,18 @@ describe("shouldApplyRule", () => { it("should return true when alwaysApply is true, regardless of file paths", () => { expect(shouldApplyRule(ruleAlwaysApplyTrue, [])).toBe(true); expect(shouldApplyRule(ruleAlwaysApplyTrue, ["src/main.js"])).toBe(true); - expect(shouldApplyRule(ruleAlwaysApplyTrue, ["Component.tsx"])).toBe(true); + expect(shouldApplyRule(ruleAlwaysApplyTrue, ["Component.tsx"])).toBe( + true, + ); }); it("should use glob matching when alwaysApply is false", () => { // Should apply when globs match expect(shouldApplyRule(ruleAlwaysApplyFalse, ["src/main.ts"])).toBe(true); - expect(shouldApplyRule(ruleAlwaysApplyFalse, ["Component.tsx"])).toBe(true); - + expect(shouldApplyRule(ruleAlwaysApplyFalse, ["Component.tsx"])).toBe( + true, + ); + // Should not apply when globs don't match expect(shouldApplyRule(ruleAlwaysApplyFalse, ["script.py"])).toBe(false); expect(shouldApplyRule(ruleAlwaysApplyFalse, [])).toBe(false); @@ -716,7 +724,9 @@ describe("shouldApplyRule", () => { it("should return false when alwaysApply is false and no globs specified", () => { expect(shouldApplyRule(ruleAlwaysApplyFalseNoGlobs, [])).toBe(false); - expect(shouldApplyRule(ruleAlwaysApplyFalseNoGlobs, ["any-file.js"])).toBe(false); + expect( + shouldApplyRule(ruleAlwaysApplyFalseNoGlobs, ["any-file.js"]), + ).toBe(false); }); }); @@ -724,7 +734,9 @@ describe("shouldApplyRule", () => { it("should return true for rules without globs regardless of file paths", () => { expect(shouldApplyRule(ruleWithoutGlobs, [])).toBe(true); expect(shouldApplyRule(ruleWithoutGlobs, ["src/main.js"])).toBe(true); - expect(shouldApplyRule(ruleWithoutGlobs, ["Component.tsx", "utils.py"])).toBe(true); + expect( + shouldApplyRule(ruleWithoutGlobs, ["Component.tsx", "utils.py"]), + ).toBe(true); }); it("should return false for rules with globs when no file paths are provided", () => { @@ -734,12 +746,16 @@ 
describe("shouldApplyRule", () => { it("should return true for rules with globs when matching file paths are provided", () => { expect(shouldApplyRule(ruleWithGlobs, ["Component.tsx"])).toBe(true); expect(shouldApplyRule(ruleWithGlobs, ["src/main.ts"])).toBe(true); - expect(shouldApplyRule(ruleWithGlobs, ["utils.js", "Component.tsx"])).toBe(true); + expect( + shouldApplyRule(ruleWithGlobs, ["utils.js", "Component.tsx"]), + ).toBe(true); }); it("should return false for rules with globs when no matching file paths are provided", () => { expect(shouldApplyRule(ruleWithGlobs, ["utils.py"])).toBe(false); - expect(shouldApplyRule(ruleWithGlobs, ["main.js", "script.rb"])).toBe(false); + expect(shouldApplyRule(ruleWithGlobs, ["main.js", "script.rb"])).toBe( + false, + ); }); }); @@ -760,19 +776,39 @@ describe("shouldApplyRule", () => { it("should handle array of glob patterns", () => { expect(shouldApplyRule(ruleWithArrayGlobs, ["src/main.ts"])).toBe(true); - expect(shouldApplyRule(ruleWithArrayGlobs, ["tests/unit.test.js"])).toBe(true); - expect(shouldApplyRule(ruleWithArrayGlobs, ["config/settings.json"])).toBe(false); + expect(shouldApplyRule(ruleWithArrayGlobs, ["tests/unit.test.js"])).toBe( + true, + ); + expect( + shouldApplyRule(ruleWithArrayGlobs, ["config/settings.json"]), + ).toBe(false); }); it("should handle string glob patterns", () => { expect(shouldApplyRule(ruleWithSpecificPattern, ["utils.py"])).toBe(true); - expect(shouldApplyRule(ruleWithSpecificPattern, ["src/models/user.py"])).toBe(true); - expect(shouldApplyRule(ruleWithSpecificPattern, ["utils.js"])).toBe(false); + expect( + shouldApplyRule(ruleWithSpecificPattern, ["src/models/user.py"]), + ).toBe(true); + expect(shouldApplyRule(ruleWithSpecificPattern, ["utils.js"])).toBe( + false, + ); }); it("should return true if any file path matches when multiple paths provided", () => { - expect(shouldApplyRule(ruleWithSpecificPattern, ["utils.js", "models.py", "config.json"])).toBe(true); - 
expect(shouldApplyRule(ruleWithGlobs, ["utils.py", "Component.tsx", "script.rb"])).toBe(true); + expect( + shouldApplyRule(ruleWithSpecificPattern, [ + "utils.js", + "models.py", + "config.json", + ]), + ).toBe(true); + expect( + shouldApplyRule(ruleWithGlobs, [ + "utils.py", + "Component.tsx", + "script.rb", + ]), + ).toBe(true); }); }); @@ -784,7 +820,7 @@ describe("shouldApplyRule", () => { globs: [], source: "rules-block", }; - + // Empty array should be treated as "no globs" (truthy check fails) expect(shouldApplyRule(ruleWithEmptyGlobs, ["any-file.js"])).toBe(false); }); @@ -796,7 +832,7 @@ describe("shouldApplyRule", () => { globs: undefined, source: "rules-block", }; - + expect(shouldApplyRule(ruleUndefinedGlobs, ["any-file.js"])).toBe(true); expect(shouldApplyRule(ruleUndefinedGlobs, [])).toBe(true); }); diff --git a/core/vendor/modules/.package-lock.json b/core/vendor/modules/.package-lock.json index 75d2e2367..b8ddf4a5f 100644 --- a/core/vendor/modules/.package-lock.json +++ b/core/vendor/modules/.package-lock.json @@ -383,11 +383,7 @@ "resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz", "integrity": "sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==", "optional": true, - "os": [ - "win32", - "darwin", - "linux" - ], + "os": ["win32", "darwin", "linux"], "dependencies": { "onnxruntime-common": "~1.14.0" } diff --git a/core/vendor/modules/@xenova/transformers/README.md b/core/vendor/modules/@xenova/transformers/README.md index d0f9bec49..dd10e4124 100644 --- a/core/vendor/modules/@xenova/transformers/README.md +++ b/core/vendor/modules/@xenova/transformers/README.md @@ -1,5 +1,3 @@ - -


@@ -28,23 +26,21 @@

- State-of-the-art Machine Learning for the web. Run ๐Ÿค— Transformers directly in your browser, with no need for a server! Transformers.js is designed to be functionally equivalent to Hugging Face's [transformers](https://github.com/huggingface/transformers) python library, meaning you can run the same pretrained models using a very similar API. These models support common tasks in different modalities, such as: - - ๐Ÿ“ **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation. - - ๐Ÿ–ผ๏ธ **Computer Vision**: image classification, object detection, and segmentation. - - ๐Ÿ—ฃ๏ธ **Audio**: automatic speech recognition and audio classification. - - ๐Ÿ™ **Multimodal**: zero-shot image classification. -Transformers.js uses [ONNX Runtime](https://onnxruntime.ai/) to run models in the browser. The best part about it, is that you can easily [convert](#convert-your-models-to-onnx) your pretrained PyTorch, TensorFlow, or JAX models to ONNX using [๐Ÿค— Optimum](https://github.com/huggingface/optimum#onnx--onnx-runtime). +- ๐Ÿ“ **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation. +- ๐Ÿ–ผ๏ธ **Computer Vision**: image classification, object detection, and segmentation. +- ๐Ÿ—ฃ๏ธ **Audio**: automatic speech recognition and audio classification. +- ๐Ÿ™ **Multimodal**: zero-shot image classification. + +Transformers.js uses [ONNX Runtime](https://onnxruntime.ai/) to run models in the browser. The best part about it, is that you can easily [convert](#convert-your-models-to-onnx) your pretrained PyTorch, TensorFlow, or JAX models to ONNX using [๐Ÿค— Optimum](https://github.com/huggingface/optimum#onnx--onnx-runtime). For more information, check out the full [documentation](https://huggingface.co/docs/transformers.js). 
- ## Quick tour - It's super simple to translate from existing code! Just like the python library, we support the `pipeline` API. Pipelines group together a pretrained model with preprocessing of inputs and postprocessing of outputs, making it the easiest way to run models with the library. @@ -69,12 +65,12 @@ out = pipe('I love transformers!')
```javascript -import { pipeline } from '@xenova/transformers'; +import { pipeline } from "@xenova/transformers"; // Allocate a pipeline for sentiment-analysis -let pipe = await pipeline('sentiment-analysis'); +let pipe = await pipeline("sentiment-analysis"); -let out = await pipe('I love transformers!'); +let out = await pipe("I love transformers!"); // [{'label': 'POSITIVE', 'score': 0.999817686}] ``` @@ -82,74 +78,72 @@ let out = await pipe('I love transformers!');
- You can also use a different model by specifying the model id or path as the second argument to the `pipeline` function. For example: + ```javascript // Use a different model for sentiment-analysis -let pipe = await pipeline('sentiment-analysis', 'Xenova/bert-base-multilingual-uncased-sentiment'); +let pipe = await pipeline( + "sentiment-analysis", + "Xenova/bert-base-multilingual-uncased-sentiment", +); ``` - ## Installation - To install via [NPM](https://www.npmjs.com/package/@xenova/transformers), run: + ```bash npm i @xenova/transformers ``` Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with: + ```html ``` - ## Examples Want to jump straight in? Get started with one of our sample applications/templates: -| Name | Description | Links | -|-------------------|----------------------------------|-------------------------------| -| Whisper Web | Speech recognition w/ Whisper | [code](https://github.com/xenova/whisper-web), [demo](https://huggingface.co/spaces/Xenova/whisper-web) | -| Doodle Dash | Real-time sketch-recognition game | [blog](https://huggingface.co/blog/ml-web-games), [code](https://github.com/xenova/doodle-dash), [demo](https://huggingface.co/spaces/Xenova/doodle-dash) | -| Code Playground | In-browser code completion website | [code](./examples/code-completion/), [demo](https://huggingface.co/spaces/Xenova/ai-code-playground) | -| Semantic Image Search (client-side) | Search for images with text | [code](./examples/semantic-image-search-client/), [demo](https://huggingface.co/spaces/Xenova/semantic-image-search-client) | -| Semantic Image Search (server-side) | Search for images with text (Supabase) | [code](./examples/semantic-image-search/), [demo](https://huggingface.co/spaces/Xenova/semantic-image-search) | -| Vanilla JavaScript | In-browser object detection | 
[video](https://scrimba.com/scrim/cKm9bDAg), [code](./examples/vanilla-js/), [demo](https://huggingface.co/spaces/Scrimba/vanilla-js-object-detector) | -| React | Multilingual translation website | [code](./examples/react-translator/), [demo](https://huggingface.co/spaces/Xenova/react-translator) | -| Text to speech (client-side) | In-browser speech synthesis | [code](./examples/text-to-speech-client/), [demo](https://huggingface.co/spaces/Xenova/text-to-speech-client) | -| Browser extension | Text classification extension | [code](./examples/extension/) | -| Electron | Text classification application | [code](./examples/electron/) | -| Next.js (client-side) | Sentiment analysis (in-browser inference) | [code](./examples/next-client/), [demo](https://huggingface.co/spaces/Xenova/next-example-app) | -| Next.js (server-side) | Sentiment analysis (Node.js inference) | [code](./examples/next-server/), [demo](https://huggingface.co/spaces/Xenova/next-server-example-app) | -| Node.js | Sentiment analysis API | [code](./examples/node/) | -| Demo site | A collection of demos | [code](./examples/demo-site/), [demo](https://xenova.github.io/transformers.js/) | +| Name | Description | Links | +| ----------------------------------- | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Whisper Web | Speech recognition w/ Whisper | [code](https://github.com/xenova/whisper-web), [demo](https://huggingface.co/spaces/Xenova/whisper-web) | +| Doodle Dash | Real-time sketch-recognition game | [blog](https://huggingface.co/blog/ml-web-games), [code](https://github.com/xenova/doodle-dash), [demo](https://huggingface.co/spaces/Xenova/doodle-dash) | +| Code Playground | In-browser code completion website | [code](./examples/code-completion/), [demo](https://huggingface.co/spaces/Xenova/ai-code-playground) | +| Semantic Image Search 
(client-side) | Search for images with text | [code](./examples/semantic-image-search-client/), [demo](https://huggingface.co/spaces/Xenova/semantic-image-search-client) | +| Semantic Image Search (server-side) | Search for images with text (Supabase) | [code](./examples/semantic-image-search/), [demo](https://huggingface.co/spaces/Xenova/semantic-image-search) | +| Vanilla JavaScript | In-browser object detection | [video](https://scrimba.com/scrim/cKm9bDAg), [code](./examples/vanilla-js/), [demo](https://huggingface.co/spaces/Scrimba/vanilla-js-object-detector) | +| React | Multilingual translation website | [code](./examples/react-translator/), [demo](https://huggingface.co/spaces/Xenova/react-translator) | +| Text to speech (client-side) | In-browser speech synthesis | [code](./examples/text-to-speech-client/), [demo](https://huggingface.co/spaces/Xenova/text-to-speech-client) | +| Browser extension | Text classification extension | [code](./examples/extension/) | +| Electron | Text classification application | [code](./examples/electron/) | +| Next.js (client-side) | Sentiment analysis (in-browser inference) | [code](./examples/next-client/), [demo](https://huggingface.co/spaces/Xenova/next-example-app) | +| Next.js (server-side) | Sentiment analysis (Node.js inference) | [code](./examples/next-server/), [demo](https://huggingface.co/spaces/Xenova/next-server-example-app) | +| Node.js | Sentiment analysis API | [code](./examples/node/) | +| Demo site | A collection of demos | [code](./examples/demo-site/), [demo](https://xenova.github.io/transformers.js/) | Check out the Transformers.js [template](https://huggingface.co/new-space?template=static-templates%2Ftransformers.js) on Hugging Face to get started in one click! 
- ## Custom usage - - By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.14.0/dist/), which should work out-of-the-box. You can customize this as follows: - ### Settings ```javascript -import { env } from '@xenova/transformers'; +import { env } from "@xenova/transformers"; // Specify a custom location for models (defaults to '/models/'). -env.localModelPath = '/path/to/models/'; +env.localModelPath = "/path/to/models/"; // Disable the loading of remote models from the Hugging Face Hub: env.allowRemoteModels = false; // Set location of .wasm files. Defaults to use a CDN. -env.backends.onnx.wasm.wasmPaths = '/path/to/files/'; +env.backends.onnx.wasm.wasmPaths = "/path/to/files/"; ``` For a full list of available settings, check out the [API Reference](https://huggingface.co/docs/transformers.js/api/env). @@ -163,6 +157,7 @@ python -m scripts.convert --quantize --model_id ``` For example, convert and quantize [bert-base-uncased](https://huggingface.co/bert-base-uncased) using: + ```bash python -m scripts.convert --quantize --model_id bert-base-uncased ``` @@ -181,7 +176,6 @@ bert-base-uncased/ For the full list of supported architectures, see the [Optimum documentation](https://huggingface.co/docs/optimum/main/en/exporters/onnx/overview). - ## Supported tasks/models Here is the list of all tasks and architectures currently supported by Transformers.js. @@ -191,78 +185,72 @@ to open up a feature request [here](https://github.com/xenova/transformers.js/is To find compatible models on the Hub, select the "transformers.js" library tag in the filter menu (or visit [this link](https://huggingface.co/models?library=transformers.js)). You can refine your search by selecting the task you're interested in (e.g., [text-classification](https://huggingface.co/models?pipeline_tag=text-classification&library=transformers.js)). 
- ### Tasks #### Natural Language Processing -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Conversational](https://huggingface.co/tasks/conversational) | `conversational` | Generating conversational text that is relevant, coherent and knowledgable given a prompt. | โŒ | -| [Fill-Mask](https://huggingface.co/tasks/fill-mask) | `fill-mask` | Masking some of the words in a sentence and predicting which words should replace those masks. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FillMaskPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers.js) | -| [Question Answering](https://huggingface.co/tasks/question-answering) | `question-answering` | Retrieve the answer to a question from a given text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.QuestionAnsweringPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=question-answering&library=transformers.js) | -| [Sentence Similarity](https://huggingface.co/tasks/sentence-similarity) | `sentence-similarity` | Determining how similar two texts are. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) | -| [Summarization](https://huggingface.co/tasks/summarization) | `summarization` | Producing a shorter version of a document while preserving its important information. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.SummarizationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=summarization&library=transformers.js) | -| [Table Question Answering](https://huggingface.co/tasks/table-question-answering) | `table-question-answering` | Answering a question about information from a given table. | โŒ | -| [Text Classification](https://huggingface.co/tasks/text-classification) | `text-classification` or `sentiment-analysis` | Assigning a label or class to a given text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-classification&library=transformers.js) | -| [Text Generation](https://huggingface.co/tasks/text-generation#completion-generation-models) | `text-generation` | Producing new text by predicting the next word in a sequence. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextGenerationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-generation&library=transformers.js) | -| [Text-to-text Generation](https://huggingface.co/tasks/text-generation#text-to-text-generation-models) | `text2text-generation` | Converting one text sequence into another text sequence. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.Text2TextGenerationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text2text-generation&library=transformers.js) | -| [Token Classification](https://huggingface.co/tasks/token-classification) | `token-classification` or `ner` | Assigning a label to each token in a text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TokenClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=token-classification&library=transformers.js) | -| [Translation](https://huggingface.co/tasks/translation) | `translation` | Converting text from one language to another. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TranslationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=translation&library=transformers.js) | -| [Zero-Shot Classification](https://huggingface.co/tasks/zero-shot-classification) | `zero-shot-classification` | Classifying text into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-classification&library=transformers.js) | +| Task | ID | Description | Supported? | +| ------------------------------------------------------------------------------------------------------ | --------------------------------------------- | ---------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Conversational](https://huggingface.co/tasks/conversational) | `conversational` | Generating conversational text that is relevant, coherent and knowledgable given a prompt. | โŒ | +| [Fill-Mask](https://huggingface.co/tasks/fill-mask) | `fill-mask` | Masking some of the words in a sentence and predicting which words should replace those masks. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FillMaskPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=fill-mask&library=transformers.js) | +| [Question Answering](https://huggingface.co/tasks/question-answering) | `question-answering` | Retrieve the answer to a question from a given text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.QuestionAnsweringPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=question-answering&library=transformers.js) | +| [Sentence Similarity](https://huggingface.co/tasks/sentence-similarity) | `sentence-similarity` | Determining how similar two texts are. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) | +| [Summarization](https://huggingface.co/tasks/summarization) | `summarization` | Producing a shorter version of a document while preserving its important information. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.SummarizationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=summarization&library=transformers.js) | +| [Table Question Answering](https://huggingface.co/tasks/table-question-answering) | `table-question-answering` | Answering a question about information from a given table. | โŒ | +| [Text Classification](https://huggingface.co/tasks/text-classification) | `text-classification` or `sentiment-analysis` | Assigning a label or class to a given text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-classification&library=transformers.js) | +| [Text Generation](https://huggingface.co/tasks/text-generation#completion-generation-models) | `text-generation` | Producing new text by predicting the next word in a sequence. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextGenerationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-generation&library=transformers.js) | +| [Text-to-text Generation](https://huggingface.co/tasks/text-generation#text-to-text-generation-models) | `text2text-generation` | Converting one text sequence into another text sequence. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.Text2TextGenerationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text2text-generation&library=transformers.js) | +| [Token Classification](https://huggingface.co/tasks/token-classification) | `token-classification` or `ner` | Assigning a label to each token in a text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TokenClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=token-classification&library=transformers.js) | +| [Translation](https://huggingface.co/tasks/translation) | `translation` | Converting text from one language to another. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TranslationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=translation&library=transformers.js) | +| [Zero-Shot Classification](https://huggingface.co/tasks/zero-shot-classification) | `zero-shot-classification` | Classifying text into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-classification&library=transformers.js) | #### Vision -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DepthEstimationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=depth-estimation&library=transformers.js) | -| [Image Classification](https://huggingface.co/tasks/image-classification) | `image-classification` | Assigning a label or class to an entire image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers.js) | -| [Image Segmentation](https://huggingface.co/tasks/image-segmentation) | `image-segmentation` | Divides an image into segments where each pixel is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageSegmentationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-segmentation&library=transformers.js) | -| [Image-to-Image](https://huggingface.co/tasks/image-to-image) | `image-to-image` | Transforming a source image to match the characteristics of a target image or a target image domain. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToImagePipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-to-image&library=transformers.js) | -| [Mask Generation](https://huggingface.co/tasks/mask-generation) | `mask-generation` | Generate masks for the objects in an image. | โŒ | -| [Object Detection](https://huggingface.co/tasks/object-detection) | `object-detection` | Identify objects of certain defined classes within an image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ObjectDetectionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=object-detection&library=transformers.js) | -| [Video Classification](https://huggingface.co/tasks/video-classification) | n/a | Assigning a label or class to an entire video. | โŒ | -| [Unconditional Image Generation](https://huggingface.co/tasks/unconditional-image-generation) | n/a | Generating images with no condition in any context (like a prompt text or another image). | โŒ | +| Task | ID | Description | Supported? | +| --------------------------------------------------------------------------------------------- | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Depth Estimation](https://huggingface.co/tasks/depth-estimation) | `depth-estimation` | Predicting the depth of objects present in an image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DepthEstimationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=depth-estimation&library=transformers.js) | +| [Image Classification](https://huggingface.co/tasks/image-classification) | `image-classification` | Assigning a label or class to an entire image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers.js) | +| [Image Segmentation](https://huggingface.co/tasks/image-segmentation) | `image-segmentation` | Divides an image into segments where each pixel is mapped to an object. This task has multiple variants such as instance segmentation, panoptic segmentation and semantic segmentation. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageSegmentationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-segmentation&library=transformers.js) | +| [Image-to-Image](https://huggingface.co/tasks/image-to-image) | `image-to-image` | Transforming a source image to match the characteristics of a target image or a target image domain. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToImagePipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-to-image&library=transformers.js) | +| [Mask Generation](https://huggingface.co/tasks/mask-generation) | `mask-generation` | Generate masks for the objects in an image. | โŒ | +| [Object Detection](https://huggingface.co/tasks/object-detection) | `object-detection` | Identify objects of certain defined classes within an image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ObjectDetectionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=object-detection&library=transformers.js) | +| [Video Classification](https://huggingface.co/tasks/video-classification) | n/a | Assigning a label or class to an entire video. | โŒ | +| [Unconditional Image Generation](https://huggingface.co/tasks/unconditional-image-generation) | n/a | Generating images with no condition in any context (like a prompt text or another image). | โŒ | #### Audio -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Audio Classification](https://huggingface.co/tasks/audio-classification) | `audio-classification` | Assigning a label or class to a given audio. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AudioClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=audio-classification&library=transformers.js) | -| [Audio-to-Audio](https://huggingface.co/tasks/audio-to-audio) | n/a | Generating audio from an input audio source. | โŒ | -| [Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition) | `automatic-speech-recognition` | Transcribing a given audio into text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AutomaticSpeechRecognitionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=transformers.js) | -| [Text-to-Speech](https://huggingface.co/tasks/text-to-speech) | `text-to-speech` or `text-to-audio` | Generating natural-sounding speech given text input. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextToAudioPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-to-audio&library=transformers.js) | - +| Task | ID | Description | Supported? | +| ----------------------------------------------------------------------------------------- | ----------------------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Audio Classification](https://huggingface.co/tasks/audio-classification) | `audio-classification` | Assigning a label or class to a given audio. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AudioClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=audio-classification&library=transformers.js) | +| [Audio-to-Audio](https://huggingface.co/tasks/audio-to-audio) | n/a | Generating audio from an input audio source. | โŒ | +| [Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition) | `automatic-speech-recognition` | Transcribing a given audio into text. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AutomaticSpeechRecognitionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=transformers.js) | +| [Text-to-Speech](https://huggingface.co/tasks/text-to-speech) | `text-to-speech` or `text-to-audio` | Generating natural-sounding speech given text input. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TextToAudioPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=text-to-audio&library=transformers.js) | #### Tabular -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Tabular Classification](https://huggingface.co/tasks/tabular-classification) | n/a | Classifying a target category (a group) based on set of attributes. | โŒ | -| [Tabular Regression](https://huggingface.co/tasks/tabular-regression) | n/a | Predicting a numerical value given a set of attributes. | โŒ | - +| Task | ID | Description | Supported? | +| ----------------------------------------------------------------------------- | --- | ------------------------------------------------------------------- | ---------- | +| [Tabular Classification](https://huggingface.co/tasks/tabular-classification) | n/a | Classifying a target category (a group) based on set of attributes. | โŒ | +| [Tabular Regression](https://huggingface.co/tasks/tabular-regression) | n/a | Predicting a numerical value given a set of attributes. | โŒ | #### Multimodal -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Document Question Answering](https://huggingface.co/tasks/document-question-answering) | `document-question-answering` | Answering questions on document images. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DocumentQuestionAnsweringPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=document-question-answering&library=transformers.js) | -| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) | -| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) | -| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | โŒ | -| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | โŒ | -| [Zero-Shot Audio Classification](https://huggingface.co/learn/audio-course/chapter4/classification_models#zero-shot-audio-classification) | `zero-shot-audio-classification` | Classifying audios into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotAudioClassificationPipeline)
[(models)](https://huggingface.co/models?other=zero-shot-audio-classification&library=transformers.js) | -| [Zero-Shot Image Classification](https://huggingface.co/tasks/zero-shot-image-classification) | `zero-shot-image-classification` | Classifying images into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotImageClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&library=transformers.js) | -| [Zero-Shot Object Detection](https://huggingface.co/tasks/zero-shot-object-detection) | `zero-shot-object-detection` | Identify objects of classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotObjectDetectionPipeline)
[(models)](https://huggingface.co/models?other=zero-shot-object-detection&library=transformers.js) | - +| Task | ID | Description | Supported? | +| ----------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Document Question Answering](https://huggingface.co/tasks/document-question-answering) | `document-question-answering` | Answering questions on document images. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DocumentQuestionAnsweringPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=document-question-answering&library=transformers.js) | +| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) | +| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) | +| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | โŒ | +| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | โŒ | +| [Zero-Shot Audio Classification](https://huggingface.co/learn/audio-course/chapter4/classification_models#zero-shot-audio-classification) | `zero-shot-audio-classification` | Classifying audios into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotAudioClassificationPipeline)
[(models)](https://huggingface.co/models?other=zero-shot-audio-classification&library=transformers.js) | +| [Zero-Shot Image Classification](https://huggingface.co/tasks/zero-shot-image-classification) | `zero-shot-image-classification` | Classifying images into classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotImageClassificationPipeline)
[(models)](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&library=transformers.js) | +| [Zero-Shot Object Detection](https://huggingface.co/tasks/zero-shot-object-detection) | `zero-shot-object-detection` | Identify objects of classes that are unseen during training. | โœ… [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotObjectDetectionPipeline)
[(models)](https://huggingface.co/models?other=zero-shot-object-detection&library=transformers.js) | #### Reinforcement Learning -| Task | ID | Description | Supported? | -|--------------------------|----|-------------|------------| -| [Reinforcement Learning](https://huggingface.co/tasks/reinforcement-learning) | n/a | Learning from actions by interacting with an environment through trial and error and receiving rewards (negative or positive) as feedback. | โŒ | - - +| Task | ID | Description | Supported? | +| ----------------------------------------------------------------------------- | --- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------- | +| [Reinforcement Learning](https://huggingface.co/tasks/reinforcement-learning) | n/a | Learning from actions by interacting with an environment through trial and error and receiving rewards (negative or positive) as feedback. | โŒ | ### Models @@ -274,7 +262,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. 
**[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). -1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez\*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
@@ -294,7 +282,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives.
**ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. @@ -347,5 +335,3 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1.
**[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. - - diff --git a/core/vendor/modules/@xenova/transformers/src/backends/onnx.js b/core/vendor/modules/@xenova/transformers/src/backends/onnx.js index 0bee3dce7..d9a742cd5 100644 --- a/core/vendor/modules/@xenova/transformers/src/backends/onnx.js +++ b/core/vendor/modules/@xenova/transformers/src/backends/onnx.js @@ -6,45 +6,46 @@ * So, we just import both packages, and use the appropriate one based on the environment: * - When running in node, we use `onnxruntime-node`. * - When running in the browser, we use `onnxruntime-web` (`onnxruntime-node` is not bundled). - * + * * This module is not directly exported, but can be accessed through the environment variables: * ```javascript * import { env } from '@xenova/transformers'; * console.log(env.backends.onnx); * ``` - * + * * @module backends/onnx */ // NOTE: Import order matters here. We need to import `onnxruntime-node` before `onnxruntime-web`. // In either case, we select the default export if it exists, otherwise we use the named export. -import * as ONNX_NODE from 'onnxruntime-node'; -import * as ONNX_WEB from 'onnxruntime-web'; +import * as ONNX_NODE from "onnxruntime-node"; +import * as ONNX_WEB from "onnxruntime-web"; /** @type {import('onnxruntime-web')} The ONNX runtime module. */ export let ONNX; export const executionProviders = [ - // 'webgpu', - 'wasm' + // 'webgpu', + "wasm", ]; -if (typeof process !== 'undefined' && process?.release?.name === 'node') { - // Running in a node-like environment. - ONNX = ONNX_NODE.default ?? ONNX_NODE; - - // Add `cpu` execution provider, with higher precedence that `wasm`. 
- executionProviders.unshift('cpu'); +if (typeof process !== "undefined" && process?.release?.name === "node") { + // Running in a node-like environment. + ONNX = ONNX_NODE.default ?? ONNX_NODE; + // Add `cpu` execution provider, with higher precedence that `wasm`. + executionProviders.unshift("cpu"); } else { - // Running in a browser-environment - ONNX = ONNX_WEB.default ?? ONNX_WEB; + // Running in a browser-environment + ONNX = ONNX_WEB.default ?? ONNX_WEB; - // SIMD for WebAssembly does not operate correctly in some recent versions of iOS (16.4.x). - // As a temporary fix, we disable it for now. - // For more information, see: https://github.com/microsoft/onnxruntime/issues/15644 - const isIOS = typeof navigator !== 'undefined' && /iP(hone|od|ad).+16_4.+AppleWebKit/.test(navigator.userAgent); - if (isIOS) { - ONNX.env.wasm.simd = false; - } + // SIMD for WebAssembly does not operate correctly in some recent versions of iOS (16.4.x). + // As a temporary fix, we disable it for now. + // For more information, see: https://github.com/microsoft/onnxruntime/issues/15644 + const isIOS = + typeof navigator !== "undefined" && + /iP(hone|od|ad).+16_4.+AppleWebKit/.test(navigator.userAgent); + if (isIOS) { + ONNX.env.wasm.simd = false; + } } diff --git a/core/vendor/modules/@xenova/transformers/src/configs.js b/core/vendor/modules/@xenova/transformers/src/configs.js index 4506d2d9c..d026a4c70 100644 --- a/core/vendor/modules/@xenova/transformers/src/configs.js +++ b/core/vendor/modules/@xenova/transformers/src/configs.js @@ -1,10 +1,9 @@ - /** * @file Helper module for using model configs. For more information, see the corresponding * [Python documentation](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoConfig). - * + * * **Example:** Load an `AutoConfig`. - * + * * ```javascript * import { AutoConfig } from '@xenova/transformers'; * let config = await AutoConfig.from_pretrained('bert-base-uncased'); @@ -23,19 +22,16 @@ * // ... 
* // } * ``` - * + * * @module configs */ -import { - getModelJSON, -} from './utils/hub.js'; +import { getModelJSON } from "./utils/hub.js"; /** * @typedef {import('./utils/hub.js').PretrainedOptions} PretrainedOptions */ - /** * Loads a config from the specified path. * @param {string} pretrained_model_name_or_path The path to the config directory. @@ -43,8 +39,13 @@ import { * @returns {Promise} A promise that resolves with information about the loaded config. */ async function loadConfig(pretrained_model_name_or_path, options) { - let info = await getModelJSON(pretrained_model_name_or_path, 'config.json', true, options); - return info; + let info = await getModelJSON( + pretrained_model_name_or_path, + "config.json", + true, + options, + ); + return info; } /** @@ -52,56 +53,60 @@ async function loadConfig(pretrained_model_name_or_path, options) { * [Python documentation](https://huggingface.co/docs/transformers/main/en/main_classes/configuration#transformers.PretrainedConfig). */ export class PretrainedConfig { - // NOTE: Typo in original + // NOTE: Typo in original - /** - * Create a new PreTrainedTokenizer instance. - * @param {Object} configJSON The JSON of the config. - */ - constructor(configJSON) { - this.model_type = null; - this.is_encoder_decoder = false; + /** + * Create a new PreTrainedTokenizer instance. + * @param {Object} configJSON The JSON of the config. + */ + constructor(configJSON) { + this.model_type = null; + this.is_encoder_decoder = false; - Object.assign(this, configJSON); - } + Object.assign(this, configJSON); + } - /** - * Loads a pre-trained config from the given `pretrained_model_name_or_path`. - * - * @param {string} pretrained_model_name_or_path The path to the pre-trained config. - * @param {PretrainedOptions} options Additional options for loading the config. - * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. 
- * - * @returns {Promise} A new instance of the `PretrainedConfig` class. - */ - static async from_pretrained(pretrained_model_name_or_path, { - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - } = {}) { - - let data = config ?? await loadConfig(pretrained_model_name_or_path, { - progress_callback, - config, - cache_dir, - local_files_only, - revision, - }) - return new this(data); - } + /** + * Loads a pre-trained config from the given `pretrained_model_name_or_path`. + * + * @param {string} pretrained_model_name_or_path The path to the pre-trained config. + * @param {PretrainedOptions} options Additional options for loading the config. + * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. + * + * @returns {Promise} A new instance of the `PretrainedConfig` class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + } = {}, + ) { + let data = + config ?? + (await loadConfig(pretrained_model_name_or_path, { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + })); + return new this(data); + } } /** * Helper class which is used to instantiate pretrained configs with the `from_pretrained` function. 
- * + * * @example - * let config = await AutoConfig.from_pretrained('bert-base-uncased'); + * let config = await AutoConfig.from_pretrained('bert-base-uncased'); */ export class AutoConfig { - /** @type {PretrainedConfig.from_pretrained} */ - static async from_pretrained(...args) { - return PretrainedConfig.from_pretrained(...args); - } + /** @type {PretrainedConfig.from_pretrained} */ + static async from_pretrained(...args) { + return PretrainedConfig.from_pretrained(...args); + } } diff --git a/core/vendor/modules/@xenova/transformers/src/env.js b/core/vendor/modules/@xenova/transformers/src/env.js index 49ea43ca6..07b1851b0 100644 --- a/core/vendor/modules/@xenova/transformers/src/env.js +++ b/core/vendor/modules/@xenova/transformers/src/env.js @@ -1,37 +1,37 @@ /** * @file Module used to configure Transformers.js. - * + * * **Example:** Disable remote models. * ```javascript * import { env } from '@xenova/transformers'; * env.allowRemoteModels = false; * ``` - * + * * **Example:** Set local model path. * ```javascript * import { env } from '@xenova/transformers'; * env.localModelPath = '/path/to/local/models/'; * ``` - * + * * **Example:** Set cache directory. 
* ```javascript * import { env } from '@xenova/transformers'; * env.cacheDir = '/path/to/cache/directory/'; * ``` - * + * * @module env */ -import fs from 'fs'; -import path from 'path'; +import fs from "fs"; +import path from "path"; -import { ONNX } from './backends/onnx.js'; +import { ONNX } from "./backends/onnx.js"; const { env: onnx_env } = ONNX; -const VERSION = '2.14.0'; +const VERSION = "2.14.0"; // Check if various APIs are available (depends on environment) -const WEB_CACHE_AVAILABLE = typeof self !== 'undefined' && 'caches' in self; +const WEB_CACHE_AVAILABLE = typeof self !== "undefined" && "caches" in self; const FS_AVAILABLE = !isEmpty(fs); // check if file system is available const PATH_AVAILABLE = !isEmpty(path); // check if path is available @@ -43,23 +43,22 @@ const RUNNING_LOCALLY = FS_AVAILABLE && PATH_AVAILABLE; // Only used for environments with access to file system const DEFAULT_CACHE_DIR = RUNNING_LOCALLY - ? path.join(__dirname, '/.cache/') - : null; + ? path.join(__dirname, "/.cache/") + : null; // Set local model path, based on available APIs -const DEFAULT_LOCAL_MODEL_PATH = '/models/'; +const DEFAULT_LOCAL_MODEL_PATH = "/models/"; const localModelPath = RUNNING_LOCALLY - ? path.join(__dirname, DEFAULT_LOCAL_MODEL_PATH) - : DEFAULT_LOCAL_MODEL_PATH; + ? path.join(__dirname, DEFAULT_LOCAL_MODEL_PATH) + : DEFAULT_LOCAL_MODEL_PATH; // Set path to wasm files. This is needed when running in a web worker. // https://onnxruntime.ai/docs/api/js/interfaces/Env.WebAssemblyFlags.html#wasmPaths // We use remote wasm files by default to make it easier for newer users. // In practice, users should probably self-host the necessary .wasm files. onnx_env.wasm.wasmPaths = RUNNING_LOCALLY - ? path.join(__dirname, '/dist/') - : `https://cdn.jsdelivr.net/npm/@xenova/transformers@${VERSION}/dist/`; - + ? path.join(__dirname, "/dist/") + : `https://cdn.jsdelivr.net/npm/@xenova/transformers@${VERSION}/dist/`; /** * Global variable used to control execution. 
This provides users a simple way to configure Transformers.js. @@ -83,44 +82,42 @@ onnx_env.wasm.wasmPaths = RUNNING_LOCALLY * implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache */ export const env = { - /////////////////// Backends settings /////////////////// - backends: { - // onnxruntime-web/onnxruntime-node - onnx: onnx_env, + /////////////////// Backends settings /////////////////// + backends: { + // onnxruntime-web/onnxruntime-node + onnx: onnx_env, - // TensorFlow.js - tfjs: {}, - }, + // TensorFlow.js + tfjs: {}, + }, - __dirname, - version: VERSION, + __dirname, + version: VERSION, - /////////////////// Model settings /////////////////// - allowRemoteModels: true, - remoteHost: 'https://huggingface.co/', - remotePathTemplate: '{model}/resolve/{revision}/', + /////////////////// Model settings /////////////////// + allowRemoteModels: true, + remoteHost: "https://huggingface.co/", + remotePathTemplate: "{model}/resolve/{revision}/", - allowLocalModels: true, - localModelPath: localModelPath, - useFS: FS_AVAILABLE, + allowLocalModels: true, + localModelPath: localModelPath, + useFS: FS_AVAILABLE, - /////////////////// Cache settings /////////////////// - useBrowserCache: WEB_CACHE_AVAILABLE, + /////////////////// Cache settings /////////////////// + useBrowserCache: WEB_CACHE_AVAILABLE, - useFSCache: FS_AVAILABLE, - cacheDir: DEFAULT_CACHE_DIR, - - useCustomCache: false, - customCache: null, - ////////////////////////////////////////////////////// -} + useFSCache: FS_AVAILABLE, + cacheDir: DEFAULT_CACHE_DIR, + useCustomCache: false, + customCache: null, + ////////////////////////////////////////////////////// +}; /** * @param {Object} obj * @private */ function isEmpty(obj) { - return Object.keys(obj).length === 0; + return Object.keys(obj).length === 0; } - diff --git a/core/vendor/modules/@xenova/transformers/src/models.js 
b/core/vendor/modules/@xenova/transformers/src/models.js index 1c48030a4..777ac86b2 100644 --- a/core/vendor/modules/@xenova/transformers/src/models.js +++ b/core/vendor/modules/@xenova/transformers/src/models.js @@ -1,9 +1,8 @@ - /** * @file Definitions of all models available in Transformers.js. - * + * * **Example:** Load and run an `AutoModel`. - * + * * ```javascript * import { AutoModel, AutoTokenizer } from '@xenova/transformers'; * @@ -19,13 +18,13 @@ * // size: 183132, * // } * ``` - * + * * We also provide other `AutoModel`s (listed below), which you can use in the same way as the Python library. For example: - * + * * **Example:** Load and run an `AutoModelForSeq2SeqLM`. * ```javascript * import { AutoModelForSeq2SeqLM, AutoTokenizer } from '@xenova/transformers'; - * + * * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/t5-small'); * let model = await AutoModelForSeq2SeqLM.from_pretrained('Xenova/t5-small'); * @@ -34,53 +33,48 @@ * let decoded = tokenizer.decode(outputs[0], { skip_special_tokens: true }); * // 'Ich liebe Transformatoren!' 
* ``` - * + * * @module models */ -import { - AutoConfig, -} from './configs.js'; +import { AutoConfig } from "./configs.js"; import { - Callable, - isIntegralNumber, - isTypedArray, - mergeArrays, -} from './utils/core.js'; + Callable, + isIntegralNumber, + isTypedArray, + mergeArrays, +} from "./utils/core.js"; + +import { getModelFile, getModelJSON } from "./utils/hub.js"; import { - getModelFile, - getModelJSON, -} from './utils/hub.js'; + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + ForceTokensLogitsProcessor, + GenerationConfig, + LogitsProcessorList, + MinLengthLogitsProcessor, + MinNewTokensLengthLogitsProcessor, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + Sampler, + SuppressTokensAtBeginLogitsProcessor, + WhisperTimeStampLogitsProcessor, +} from "./utils/generation.js"; import { - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - ForceTokensLogitsProcessor, - GenerationConfig, - LogitsProcessorList, - MinLengthLogitsProcessor, - MinNewTokensLengthLogitsProcessor, - NoBadWordsLogitsProcessor, - NoRepeatNGramLogitsProcessor, - RepetitionPenaltyLogitsProcessor, - Sampler, - SuppressTokensAtBeginLogitsProcessor, - WhisperTimeStampLogitsProcessor, -} from './utils/generation.js'; + cat, + dynamicTimeWarping, + mean, + ones_like, + stack, + std_mean, + Tensor, +} from "./utils/tensor.js"; -import { - cat, - dynamicTimeWarping, - mean, - ones_like, - stack, - std_mean, - Tensor, -} from './utils/tensor.js'; - -import { executionProviders, ONNX } from './backends/onnx.js'; +import { executionProviders, ONNX } from "./backends/onnx.js"; // import { medianFilter } from './transformers.js'; const { InferenceSession, Tensor: ONNXTensor, env } = ONNX; @@ -89,16 +83,15 @@ const { InferenceSession, Tensor: ONNXTensor, env } = ONNX; ////////////////////////////////////////////////// // Model types: used internally const MODEL_TYPES = { - EncoderOnly: 0, - EncoderDecoder: 1, - 
Seq2Seq: 2, - Vision2Seq: 3, - DecoderOnly: 4, - MaskGeneration: 5, -} + EncoderOnly: 0, + EncoderDecoder: 1, + Seq2Seq: 2, + Vision2Seq: 3, + DecoderOnly: 4, + MaskGeneration: 5, +}; ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Helper functions @@ -107,7 +100,6 @@ const MODEL_TYPE_MAPPING = new Map(); const MODEL_NAME_TO_CLASS_MAPPING = new Map(); const MODEL_CLASS_TO_NAME_MAPPING = new Map(); - /** * Constructs an InferenceSession using a model file located at the specified path. * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. @@ -116,30 +108,39 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map(); * @returns {Promise} A Promise that resolves to an InferenceSession object. * @private */ -async function constructSession(pretrained_model_name_or_path, fileName, options) { - // TODO add option for user to force specify their desired execution provider - let modelFileName = `onnx/${fileName}${options.quantized ? '_quantized' : ''}.onnx`; - let buffer = await getModelFile(pretrained_model_name_or_path, modelFileName, true, options); +async function constructSession( + pretrained_model_name_or_path, + fileName, + options, +) { + // TODO add option for user to force specify their desired execution provider + let modelFileName = `onnx/${fileName}${options.quantized ? "_quantized" : ""}.onnx`; + let buffer = await getModelFile( + pretrained_model_name_or_path, + modelFileName, + true, + options, + ); - try { - return await InferenceSession.create(buffer, { - executionProviders, - }); - } catch (err) { - // If the execution provided was only wasm, throw the error - if (executionProviders.length === 1 && executionProviders[0] === 'wasm') { - throw err; - } - - console.warn(err); - console.warn( - 'Something went wrong during model construction (most likely a missing operation). ' + - 'Using `wasm` as a fallback. 
' - ) - return await InferenceSession.create(buffer, { - executionProviders: ['wasm'] - }); + try { + return await InferenceSession.create(buffer, { + executionProviders, + }); + } catch (err) { + // If the execution provided was only wasm, throw the error + if (executionProviders.length === 1 && executionProviders[0] === "wasm") { + throw err; } + + console.warn(err); + console.warn( + "Something went wrong during model construction (most likely a missing operation). " + + "Using `wasm` as a fallback. ", + ); + return await InferenceSession.create(buffer, { + executionProviders: ["wasm"], + }); + } } /** @@ -151,41 +152,46 @@ async function constructSession(pretrained_model_name_or_path, fileName, options * @private */ function validateInputs(session, inputs) { - /** - * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` - * @type {Record} - */ - const checkedInputs = Object.create(null); - const missingInputs = []; - for (const inputName of session.inputNames) { - const tensor = inputs[inputName]; - // Rare case where one of the model's input names corresponds to a built-in - // object name (e.g., toString), which would cause a simple (!tensor) check to fail, - // because it's not undefined but a function. - if (!(tensor instanceof Tensor)) { - missingInputs.push(inputName); - continue; - } - // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker - // boundary, transferring ownership to the worker and invalidating the tensor. - // So, in this case, we simply sacrifice a clone for it. - checkedInputs[inputName] = env.wasm.proxy ? 
tensor.clone() : tensor; - } - if (missingInputs.length > 0) { - throw new Error( - `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(', ')}.`); + /** + * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` + * @type {Record} + */ + const checkedInputs = Object.create(null); + const missingInputs = []; + for (const inputName of session.inputNames) { + const tensor = inputs[inputName]; + // Rare case where one of the model's input names corresponds to a built-in + // object name (e.g., toString), which would cause a simple (!tensor) check to fail, + // because it's not undefined but a function. + if (!(tensor instanceof Tensor)) { + missingInputs.push(inputName); + continue; } + // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker + // boundary, transferring ownership to the worker and invalidating the tensor. + // So, in this case, we simply sacrifice a clone for it. + checkedInputs[inputName] = env.wasm.proxy ? tensor.clone() : tensor; + } + if (missingInputs.length > 0) { + throw new Error( + `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(", ")}.`, + ); + } - const numInputsProvided = Object.keys(inputs).length; - const numInputsNeeded = session.inputNames.length; - if (numInputsProvided > numInputsNeeded) { - // No missing inputs, but too many inputs were provided. - // Warn the user and ignore the extra inputs. - let ignored = Object.keys(inputs).filter(inputName => !session.inputNames.includes(inputName)); - console.warn(`WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(', ')}".`); - } + const numInputsProvided = Object.keys(inputs).length; + const numInputsNeeded = session.inputNames.length; + if (numInputsProvided > numInputsNeeded) { + // No missing inputs, but too many inputs were provided. 
+ // Warn the user and ignore the extra inputs. + let ignored = Object.keys(inputs).filter( + (inputName) => !session.inputNames.includes(inputName), + ); + console.warn( + `WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(", ")}".`, + ); + } - return checkedInputs; + return checkedInputs; } /** @@ -193,25 +199,25 @@ function validateInputs(session, inputs) { * NOTE: `inputs` must contain at least the input names of the model. * - If additional inputs are passed, they will be ignored. * - If inputs are missing, an error will be thrown. - * + * * @param {InferenceSession} session The InferenceSession object to run. * @param {Object} inputs An object that maps input names to input tensors. * @returns {Promise} A Promise that resolves to an object that maps output names to output tensors. * @private */ async function sessionRun(session, inputs) { - const checkedInputs = validateInputs(session, inputs); - try { - // @ts-ignore - let output = await session.run(checkedInputs); - output = replaceTensors(output); - return output; - } catch (e) { - // This usually occurs when the inputs are of the wrong type. - console.error(`An error occurred during model execution: "${e}".`); - console.error('Inputs given to model:', checkedInputs); - throw e; - } + const checkedInputs = validateInputs(session, inputs); + try { + // @ts-ignore + let output = await session.run(checkedInputs); + output = replaceTensors(output); + return output; + } catch (e) { + // This usually occurs when the inputs are of the wrong type. 
+ console.error(`An error occurred during model execution: "${e}".`); + console.error("Inputs given to model:", checkedInputs); + throw e; + } } /** @@ -221,17 +227,16 @@ async function sessionRun(session, inputs) { * @private */ function replaceTensors(obj) { - for (let prop in obj) { - if (obj[prop] instanceof ONNXTensor) { - obj[prop] = new Tensor(obj[prop]); - } else if (typeof obj[prop] === 'object') { - replaceTensors(obj[prop]); - } + for (let prop in obj) { + if (obj[prop] instanceof ONNXTensor) { + obj[prop] = new Tensor(obj[prop]); + } else if (typeof obj[prop] === "object") { + replaceTensors(obj[prop]); } - return obj; + } + return obj; } - /** * Converts an array or Tensor of integers to an int64 Tensor. * @param {Array|Tensor} items The input integers to be converted. @@ -240,31 +245,35 @@ function replaceTensors(obj) { * @private */ function toI64Tensor(items) { - if (items instanceof Tensor) { - return items; - } - // items is an array - if (items.length === 0) { - throw Error("items must be non-empty"); + if (items instanceof Tensor) { + return items; + } + // items is an array + if (items.length === 0) { + throw Error("items must be non-empty"); + } + + if (Array.isArray(items[0])) { + // batched + if (items.some((x) => x.length !== items[0].length)) { + throw Error( + "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.", + ); } - if (Array.isArray(items[0])) { - // batched - if (items.some(x => x.length !== items[0].length)) { - throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.") - } - - return new Tensor('int64', - BigInt64Array.from(items.flat().map(x => BigInt(x))), - [items.length, items[0].length] - ); - } else { - //flat - return new Tensor('int64', - BigInt64Array.from(items.map(x => 
BigInt(x))), - [1, items.length] - ); - } + return new Tensor( + "int64", + BigInt64Array.from(items.flat().map((x) => BigInt(x))), + [items.length, items[0].length], + ); + } else { + //flat + return new Tensor( + "int64", + BigInt64Array.from(items.map((x) => BigInt(x))), + [1, items.length], + ); + } } /** @@ -275,27 +284,27 @@ function toI64Tensor(items) { * @private */ function prepareAttentionMask(self, tokens) { + // Prepare attention mask + let pad_token_id = self.config.pad_token_id ?? null; + let eos_token_id = self.config.eos_token_id ?? null; + if (isIntegralNumber(eos_token_id)) { + eos_token_id = [eos_token_id]; + } - // Prepare attention mask - let pad_token_id = self.config.pad_token_id ?? null; - let eos_token_id = self.config.eos_token_id ?? null; - if (isIntegralNumber(eos_token_id)) { - eos_token_id = [eos_token_id]; - } + let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1; + let is_pad_token_not_equal_to_eos_token_id = + eos_token_id === null || !eos_token_id.includes(pad_token_id); - let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1; - let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id) - - if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) { - let data = BigInt64Array.from( - // Note: != so that int matches bigint - // @ts-ignore - tokens.data.map(x => x != pad_token_id) - ) - return new Tensor('int64', data, tokens.dims) - } else { - return ones_like(tokens); - } + if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) { + let data = BigInt64Array.from( + // Note: != so that int matches bigint + // @ts-ignore + tokens.data.map((x) => x != pad_token_id), + ); + return new Tensor("int64", data, tokens.dims); + } else { + return ones_like(tokens); + } } /** @@ -307,30 +316,31 @@ function prepareAttentionMask(self, tokens) { * @private */ function preparePositionIds(session, feeds, use_cache_branch) { - if 
(!session.inputNames.includes('position_ids')) return; + if (!session.inputNames.includes("position_ids")) return; - const data = new BigInt64Array(feeds.attention_mask.data.length); + const data = new BigInt64Array(feeds.attention_mask.data.length); - // Compute cumulative sum of the attention mask along the sequence length dimension - for (let i = 0; i < feeds.attention_mask.dims[0]; ++i) { - let start = i * feeds.attention_mask.dims[1]; - let sum = BigInt(0); - for (let j = 0; j < feeds.attention_mask.dims[1]; ++j) { - const index = start + j; - if (feeds.attention_mask.data[index] === 0n) { - data[index] = BigInt(1); - } else { // === 1n - data[index] = sum; - sum += feeds.attention_mask.data[index]; - } - } + // Compute cumulative sum of the attention mask along the sequence length dimension + for (let i = 0; i < feeds.attention_mask.dims[0]; ++i) { + let start = i * feeds.attention_mask.dims[1]; + let sum = BigInt(0); + for (let j = 0; j < feeds.attention_mask.dims[1]; ++j) { + const index = start + j; + if (feeds.attention_mask.data[index] === 0n) { + data[index] = BigInt(1); + } else { + // === 1n + data[index] = sum; + sum += feeds.attention_mask.data[index]; + } } + } - feeds.position_ids = new Tensor('int64', data, feeds.attention_mask.dims); + feeds.position_ids = new Tensor("int64", data, feeds.attention_mask.dims); - if (use_cache_branch) { - feeds.position_ids = feeds.position_ids.slice(null, -1).unsqueeze_(-1); - } + if (use_cache_branch) { + feeds.position_ids = feeds.position_ids.slice(null, -1).unsqueeze_(-1); + } } /** @@ -340,7 +350,7 @@ function preparePositionIds(session, feeds, use_cache_branch) { * @private */ function boolTensor(value) { - return new Tensor('bool', [value], [1]); + return new Tensor("bool", [value], [1]); } // JS doesn't support mixins, so we define some reused functions here, and allow "this" to be passed in @@ -352,38 +362,52 @@ function boolTensor(value) { * @private */ async function seq2seqForward(self, model_inputs) 
{ + let { encoder_outputs, past_key_values } = model_inputs; - let { encoder_outputs, past_key_values } = model_inputs; + if (!encoder_outputs) { + // Encoder outputs are not given, so we must compute them. + encoder_outputs = (await encoderForward(self, model_inputs)) + .last_hidden_state; + } + let decoderFeeds = { + input_ids: model_inputs.decoder_input_ids, + encoder_hidden_states: encoder_outputs, + }; + const use_cache_branch = !!past_key_values; - if (!encoder_outputs) { - // Encoder outputs are not given, so we must compute them. - encoder_outputs = (await encoderForward(self, model_inputs)).last_hidden_state; - } - let decoderFeeds = { - input_ids: model_inputs.decoder_input_ids, - encoder_hidden_states: encoder_outputs, - }; - const use_cache_branch = !!past_key_values; + if (self.decoder_merged_session.inputNames.includes("use_cache_branch")) { + decoderFeeds.use_cache_branch = boolTensor(use_cache_branch); + } - if (self.decoder_merged_session.inputNames.includes('use_cache_branch')) { - decoderFeeds.use_cache_branch = boolTensor(use_cache_branch); - } + if ( + self.decoder_merged_session.inputNames.includes("encoder_attention_mask") + ) { + decoderFeeds.encoder_attention_mask = model_inputs.attention_mask; + } - if (self.decoder_merged_session.inputNames.includes('encoder_attention_mask')) { - decoderFeeds.encoder_attention_mask = model_inputs.attention_mask - } + preparePositionIds( + self.decoder_merged_session, + decoderFeeds, + use_cache_branch, + ); + self.addPastKeyValues(decoderFeeds, past_key_values); - preparePositionIds(self.decoder_merged_session, decoderFeeds, use_cache_branch); - self.addPastKeyValues(decoderFeeds, past_key_values); + const decoderResults = await sessionRun( + self.decoder_merged_session, + decoderFeeds, + ); + let logits = decoderResults.logits; + past_key_values = self.getPastKeyValues(decoderResults, past_key_values); - const decoderResults = await sessionRun(self.decoder_merged_session, decoderFeeds); - let logits = 
decoderResults.logits; - past_key_values = self.getPastKeyValues(decoderResults, past_key_values); + // Get cross attention and/or decoder attentions if they are present + const attns = self.getAttentions(decoderResults); - // Get cross attention and/or decoder attentions if they are present - const attns = self.getAttentions(decoderResults); - - return new Seq2SeqLMOutput({ logits, past_key_values, encoder_outputs, ...attns }); + return new Seq2SeqLMOutput({ + logits, + past_key_values, + encoder_outputs, + ...attns, + }); } /** @@ -395,54 +419,59 @@ async function seq2seqForward(self, model_inputs) { * @returns {Object[]} Array of beam search objects. * @private */ -function seq2seqStartBeams(self, inputTokenIds, generation_config, numOutputTokens) { - let beams = []; - let beamId = 0; +function seq2seqStartBeams( + self, + inputTokenIds, + generation_config, + numOutputTokens, +) { + let beams = []; + let beamId = 0; - // @ts-ignore - const requires_attention_mask = self.requires_attention_mask ?? true; + // @ts-ignore + const requires_attention_mask = self.requires_attention_mask ?? true; - // decoder_input_ids == output_token_ids - let decoder_input_ids = - generation_config.decoder_input_ids - ?? generation_config.decoder_start_token_id - ?? generation_config.bos_token_id - ?? generation_config.eos_token_id; + // decoder_input_ids == output_token_ids + let decoder_input_ids = + generation_config.decoder_input_ids ?? + generation_config.decoder_start_token_id ?? + generation_config.bos_token_id ?? 
+ generation_config.eos_token_id; - // Support input as tensor or list - // TODO support batched decoder_input_ids - if (decoder_input_ids instanceof Tensor) { - decoder_input_ids = decoder_input_ids.tolist().flat(); - } else if (!Array.isArray(decoder_input_ids)) { - decoder_input_ids = [decoder_input_ids]; + // Support input as tensor or list + // TODO support batched decoder_input_ids + if (decoder_input_ids instanceof Tensor) { + decoder_input_ids = decoder_input_ids.tolist().flat(); + } else if (!Array.isArray(decoder_input_ids)) { + decoder_input_ids = [decoder_input_ids]; + } + + for (let tokens of inputTokenIds) { + // TODO: Improve + // Currently, just add back batch dimension. + // In future, allow for true parallel execution + tokens.dims = [1, ...tokens.dims]; + + // Create beam + let start = { + inputs: tokens, + encoder_outputs: null, + prev_model_outputs: null, + + output_token_ids: decoder_input_ids, + done: false, + score: 0, + id: beamId++, // assign unique id to beams + }; + + if (requires_attention_mask) { + start.attention_mask = prepareAttentionMask(self, tokens); } - for (let tokens of inputTokenIds) { - // TODO: Improve - // Currently, just add back batch dimension. 
- // In future, allow for true parallel execution - tokens.dims = [1, ...tokens.dims] + beams.push(start); + } - // Create beam - let start = { - inputs: tokens, - encoder_outputs: null, - prev_model_outputs: null, - - output_token_ids: decoder_input_ids, - done: false, - score: 0, - id: beamId++ // assign unique id to beams - } - - if (requires_attention_mask) { - start.attention_mask = prepareAttentionMask(self, tokens); - } - - beams.push(start); - } - - return beams; + return beams; } /** @@ -455,34 +484,34 @@ function seq2seqStartBeams(self, inputTokenIds, generation_config, numOutputToke * @private */ async function seq2seqRunBeam(self, beam) { - const input_name = self.main_input_name; + const input_name = self.main_input_name; - let decoder_input_ids = beam.output_token_ids; - if (beam.prev_model_outputs) { - // After the first step, `prev_model_outputs` won't be null. - // So, we cut decoder_input_ids if past is used - decoder_input_ids = decoder_input_ids.slice(-1); - } + let decoder_input_ids = beam.output_token_ids; + if (beam.prev_model_outputs) { + // After the first step, `prev_model_outputs` won't be null. + // So, we cut decoder_input_ids if past is used + decoder_input_ids = decoder_input_ids.slice(-1); + } - // 1. Prepare - let model_inputs = { - [input_name]: beam.inputs, - decoder_input_ids: toI64Tensor(decoder_input_ids), - encoder_outputs: beam.encoder_outputs, - past_key_values: beam.prev_model_outputs?.past_key_values, - } - if (beam.attention_mask) { - model_inputs.attention_mask = beam.attention_mask - } + // 1. Prepare + let model_inputs = { + [input_name]: beam.inputs, + decoder_input_ids: toI64Tensor(decoder_input_ids), + encoder_outputs: beam.encoder_outputs, + past_key_values: beam.prev_model_outputs?.past_key_values, + }; + if (beam.attention_mask) { + model_inputs.attention_mask = beam.attention_mask; + } - // 2. Run - let output = await self.forward(model_inputs); + // 2. 
Run + let output = await self.forward(model_inputs); - // 3. Update - beam.prev_model_outputs = output; - beam.encoder_outputs = output.encoder_outputs; + // 3. Update + beam.prev_model_outputs = output; + beam.encoder_outputs = output.encoder_outputs; - return output; + return output; } /** @@ -492,7 +521,7 @@ async function seq2seqRunBeam(self, beam) { * @private */ function seq2seqUpdatebeam(beam, newTokenId) { - beam.output_token_ids = [...beam.output_token_ids, newTokenId]; + beam.output_token_ids = [...beam.output_token_ids, newTokenId]; } /** @@ -503,23 +532,25 @@ function seq2seqUpdatebeam(beam, newTokenId) { * @private */ async function encoderForward(self, model_inputs) { - const encoderFeeds = Object.create(null); - for (const key of self.session.inputNames) { - encoderFeeds[key] = model_inputs[key]; - } - if (self.session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) { - // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, - // but they weren't created by the tokenizer. - encoderFeeds.token_type_ids = new Tensor( - 'int64', - new BigInt64Array(encoderFeeds.input_ids.data.length), - encoderFeeds.input_ids.dims - ) - } - return await sessionRun(self.session, encoderFeeds); + const encoderFeeds = Object.create(null); + for (const key of self.session.inputNames) { + encoderFeeds[key] = model_inputs[key]; + } + if ( + self.session.inputNames.includes("token_type_ids") && + !encoderFeeds.token_type_ids + ) { + // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, + // but they weren't created by the tokenizer. + encoderFeeds.token_type_ids = new Tensor( + "int64", + new BigInt64Array(encoderFeeds.input_ids.data.length), + encoderFeeds.input_ids.dims, + ); + } + return await sessionRun(self.session, encoderFeeds); } - /** * Forward pass of a decoder model. * @param {Object} self The decoder model. 
@@ -528,27 +559,27 @@ async function encoderForward(self, model_inputs) { * @private */ async function decoderForward(self, model_inputs) { - let { input_ids, past_key_values, attention_mask } = model_inputs; - let decoderFeeds = { - input_ids: input_ids, - attention_mask: attention_mask ?? prepareAttentionMask(self, input_ids), - } - const use_cache_branch = !!past_key_values; + let { input_ids, past_key_values, attention_mask } = model_inputs; + let decoderFeeds = { + input_ids: input_ids, + attention_mask: attention_mask ?? prepareAttentionMask(self, input_ids), + }; + const use_cache_branch = !!past_key_values; - if (self.session.inputNames.includes('use_cache_branch')) { - decoderFeeds.use_cache_branch = boolTensor(use_cache_branch); - } + if (self.session.inputNames.includes("use_cache_branch")) { + decoderFeeds.use_cache_branch = boolTensor(use_cache_branch); + } - preparePositionIds(self.session, decoderFeeds, use_cache_branch); + preparePositionIds(self.session, decoderFeeds, use_cache_branch); - self.addPastKeyValues(decoderFeeds, past_key_values); + self.addPastKeyValues(decoderFeeds, past_key_values); - let decoderResults = await sessionRun(self.session, decoderFeeds); + let decoderResults = await sessionRun(self.session, decoderFeeds); - let logits = decoderResults.logits; + let logits = decoderResults.logits; - past_key_values = self.getPastKeyValues(decoderResults, past_key_values); - return { logits, past_key_values }; + past_key_values = self.getPastKeyValues(decoderResults, past_key_values); + return { logits, past_key_values }; } /** @@ -561,44 +592,49 @@ async function decoderForward(self, model_inputs) { * @returns {Object[]} An array of beams initialized with the given inputs and parameters. 
* @private */ -function decoderStartBeams(self, inputTokenIds, generation_config, numOutputTokens, inputs_attention_mask) { - let beams = []; +function decoderStartBeams( + self, + inputTokenIds, + generation_config, + numOutputTokens, + inputs_attention_mask, +) { + let beams = []; - let beamId = 0; - for (let tokens of inputTokenIds) { - let output_token_ids = tokens.tolist().map(Number); + let beamId = 0; + for (let tokens of inputTokenIds) { + let output_token_ids = tokens.tolist().map(Number); - // TODO: Improve - // Currently, just add back batch dimension. - // In future, allow for true parallel execution - tokens.dims = [1, ...tokens.dims] + // TODO: Improve + // Currently, just add back batch dimension. + // In future, allow for true parallel execution + tokens.dims = [1, ...tokens.dims]; - let attn_mask; - if (inputs_attention_mask) { - attn_mask = inputs_attention_mask[beamId]; - attn_mask.dims = [1, ...attn_mask.dims] - - } else { - attn_mask = prepareAttentionMask(self, tokens) - } - - let start = { - input: tokens, - model_input_ids: tokens, - attention_mask: attn_mask, - prev_model_outputs: null, - - output_token_ids: output_token_ids, - num_output_tokens: numOutputTokens, - - done: false, - score: 0, - id: beamId++ // assign unique id to beams - } - - beams.push(start); + let attn_mask; + if (inputs_attention_mask) { + attn_mask = inputs_attention_mask[beamId]; + attn_mask.dims = [1, ...attn_mask.dims]; + } else { + attn_mask = prepareAttentionMask(self, tokens); } - return beams; + + let start = { + input: tokens, + model_input_ids: tokens, + attention_mask: attn_mask, + prev_model_outputs: null, + + output_token_ids: output_token_ids, + num_output_tokens: numOutputTokens, + + done: false, + score: 0, + id: beamId++, // assign unique id to beams + }; + + beams.push(start); + } + return beams; } /** @@ -615,26 +651,22 @@ function decoderStartBeams(self, inputTokenIds, generation_config, numOutputToke * @private */ async function decoderRunBeam(self, 
beam) { - let attnMaskData = new BigInt64Array(beam.output_token_ids.length).fill(1n) + let attnMaskData = new BigInt64Array(beam.output_token_ids.length).fill(1n); - // 1. Prepare - let model_inputs = { - input_ids: beam.model_input_ids, - attention_mask: new Tensor( - 'int64', - attnMaskData, - [1, attnMaskData.length] - ), - past_key_values: beam.prev_model_outputs?.past_key_values, - } + // 1. Prepare + let model_inputs = { + input_ids: beam.model_input_ids, + attention_mask: new Tensor("int64", attnMaskData, [1, attnMaskData.length]), + past_key_values: beam.prev_model_outputs?.past_key_values, + }; - // 2. Run - let output = await self.forward(model_inputs); + // 2. Run + let output = await self.forward(model_inputs); - // 3. Update - beam.prev_model_outputs = output; + // 3. Update + beam.prev_model_outputs = output; - return output; + return output; } /** @@ -644,8 +676,8 @@ async function decoderRunBeam(self, beam) { * @private */ function decoderUpdatebeam(beam, newTokenId) { - beam.output_token_ids = [...beam.output_token_ids, newTokenId]; - beam.model_input_ids = new Tensor('int64', [BigInt(newTokenId)], [1, 1]); + beam.output_token_ids = [...beam.output_token_ids, newTokenId]; + beam.model_input_ids = new Tensor("int64", [BigInt(newTokenId)], [1, 1]); } ////////////////////////////////////////////////// @@ -655,858 +687,1041 @@ function decoderUpdatebeam(beam, newTokenId) { * A base class for pre-trained models that provides the model configuration and an ONNX session. */ export class PreTrainedModel extends Callable { - main_input_name = 'input_ids'; + main_input_name = "input_ids"; - /** - * Creates a new instance of the `PreTrainedModel` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - */ - constructor(config, session) { - super(); + /** + * Creates a new instance of the `PreTrainedModel` class. + * @param {Object} config The model configuration. 
+ * @param {any} session session for the model. + */ + constructor(config, session) { + super(); - this.config = config; - this.session = session; + this.config = config; + this.session = session; - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - const modelType = MODEL_TYPE_MAPPING.get(modelName); + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + const modelType = MODEL_TYPE_MAPPING.get(modelName); - this.can_generate = false; - this._runBeam = null; - this._getStartBeams = null; - this._updateBeam = null; - this._forward = null; - if (modelType === MODEL_TYPES.DecoderOnly) { - this.can_generate = true; + this.can_generate = false; + this._runBeam = null; + this._getStartBeams = null; + this._updateBeam = null; + this._forward = null; + if (modelType === MODEL_TYPES.DecoderOnly) { + this.can_generate = true; - this._runBeam = decoderRunBeam; - this._getStartBeams = decoderStartBeams; - this._updateBeam = decoderUpdatebeam; - this._forward = decoderForward; + this._runBeam = decoderRunBeam; + this._getStartBeams = decoderStartBeams; + this._updateBeam = decoderUpdatebeam; + this._forward = decoderForward; + } else if ( + modelType === MODEL_TYPES.Seq2Seq || + modelType === MODEL_TYPES.Vision2Seq + ) { + this.can_generate = true; - } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { - this.can_generate = true; + this._runBeam = seq2seqRunBeam; + this._getStartBeams = seq2seqStartBeams; + this._updateBeam = seq2seqUpdatebeam; + this._forward = seq2seqForward; + } else if (modelType === MODEL_TYPES.EncoderDecoder) { + this._forward = encoderForward; + } else { + // should be MODEL_TYPES.EncoderOnly + this._forward = encoderForward; + } + } - this._runBeam = seq2seqRunBeam; - this._getStartBeams = seq2seqStartBeams; - this._updateBeam = seq2seqUpdatebeam; - this._forward = seq2seqForward; + /** + * Disposes of all the ONNX sessions that were created during inference. 
+ * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. + * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry + */ + async dispose() { + const promises = []; + for (let key of Object.keys(this)) { + const item = this[key]; + // @ts-ignore + if (item instanceof InferenceSession) { + promises.push(item.handler.dispose()); + } + } + return await Promise.all(promises); + } - } else if (modelType === MODEL_TYPES.EncoderDecoder) { - this._forward = encoderForward; + /** + * Instantiate one of the model classes of the library from a pretrained model. + * + * The model class to instantiate is selected based on the `model_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. + * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. + * + * @returns {Promise} A new instance of the `PreTrainedModel` class. 
+ */ + static async from_pretrained( + pretrained_model_name_or_path, + { + quantized = true, + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + model_file_name = null, + } = {}, + ) { + let options = { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + }; - } else { // should be MODEL_TYPES.EncoderOnly - this._forward = encoderForward; - } + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); + const modelType = MODEL_TYPE_MAPPING.get(modelName); + + let info; + if (modelType === MODEL_TYPES.DecoderOnly) { + info = await Promise.all([ + AutoConfig.from_pretrained(pretrained_model_name_or_path, options), + constructSession( + pretrained_model_name_or_path, + options.model_file_name ?? "decoder_model_merged", + options, + ), + getModelJSON( + pretrained_model_name_or_path, + "generation_config.json", + false, + options, + ), + ]); + } else if ( + modelType === MODEL_TYPES.Seq2Seq || + modelType === MODEL_TYPES.Vision2Seq + ) { + info = await Promise.all([ + AutoConfig.from_pretrained(pretrained_model_name_or_path, options), + constructSession( + pretrained_model_name_or_path, + "encoder_model", + options, + ), + constructSession( + pretrained_model_name_or_path, + "decoder_model_merged", + options, + ), + getModelJSON( + pretrained_model_name_or_path, + "generation_config.json", + false, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.MaskGeneration) { + info = await Promise.all([ + AutoConfig.from_pretrained(pretrained_model_name_or_path, options), + constructSession( + pretrained_model_name_or_path, + "vision_encoder", + options, + ), + constructSession( + pretrained_model_name_or_path, + "prompt_encoder_mask_decoder", + options, + ), + ]); + } else if (modelType === MODEL_TYPES.EncoderDecoder) { + info = await Promise.all([ + AutoConfig.from_pretrained(pretrained_model_name_or_path, options), + constructSession( + 
pretrained_model_name_or_path, + "encoder_model", + options, + ), + constructSession( + pretrained_model_name_or_path, + "decoder_model_merged", + options, + ), + ]); + } else { + // should be MODEL_TYPES.EncoderOnly + if (modelType !== MODEL_TYPES.EncoderOnly) { + console.warn( + `Model type for '${modelName}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`, + ); + } + info = await Promise.all([ + AutoConfig.from_pretrained(pretrained_model_name_or_path, options), + constructSession( + pretrained_model_name_or_path, + options.model_file_name ?? "model", + options, + ), + ]); } - /** - * Disposes of all the ONNX sessions that were created during inference. - * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. - * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry - */ - async dispose() { - const promises = []; - for (let key of Object.keys(this)) { - const item = this[key]; - // @ts-ignore - if (item instanceof InferenceSession) { - promises.push(item.handler.dispose()) - } - } - return await Promise.all(promises); + // @ts-ignore + return new this(...info); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Object containing input tensors + * @returns {Promise} Object containing output tensors + */ + async _call(model_inputs) { + return await this.forward(model_inputs); + } + + /** + * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method + * will be chosen based on the model type. + * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. + * @returns {Promise} The output data from the model in the format specified in the ONNX model. + * @throws {Error} This method must be implemented in subclasses. 
+ */ + async forward(model_inputs) { + return await this._forward(this, model_inputs); + } + + /** + * @param {import('./utils/generation.js').GenerationConfigType} generation_config + * @param {number} input_ids_seq_length The starting sequence length for the input ids. + * @returns {LogitsProcessorList} + * @private + */ + _get_logits_processor( + generation_config, + input_ids_seq_length, + // encoder_input_ids, TODO + // prefix_allowed_tokens_fn, TODO + logits_processor = null, + ) { + const processors = new LogitsProcessorList(); + + // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { + // processors.push(new HammingDiversityLogitsProcessor( + // generation_config.diversity_penalty, + // generation_config.num_beams, + // generation_config.num_beam_groups + // )); + // } + + // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { + // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( + // generation_config.encoder_repetition_penalty, + // encoder_input_ids + // )); + // } + + if ( + generation_config.repetition_penalty !== null && + generation_config.repetition_penalty !== 1.0 + ) { + processors.push( + new RepetitionPenaltyLogitsProcessor( + generation_config.repetition_penalty, + ), + ); } - /** - * Instantiate one of the model classes of the library from a pretrained model. - * - * The model class to instantiate is selected based on the `model_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. 
- * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. - * - * @returns {Promise} A new instance of the `PreTrainedModel` class. - */ - static async from_pretrained(pretrained_model_name_or_path, { - quantized = true, - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - model_file_name = null, - } = {}) { + if ( + generation_config.no_repeat_ngram_size !== null && + generation_config.no_repeat_ngram_size > 0 + ) { + processors.push( + new NoRepeatNGramLogitsProcessor( + generation_config.no_repeat_ngram_size, + ), + ); + } - let options = { - quantized, - progress_callback, - config, - cache_dir, - local_files_only, - revision, - model_file_name, + // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { + // if (this.config.is_encoder_decoder) { + // processors.push(new EncoderNoRepeatNGramLogitsProcessor( + // generation_config.encoder_no_repeat_ngram_size, + // encoder_input_ids + // )); + // } else { + // throw new Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); + // } + // } + + if (generation_config.bad_words_ids !== null) { + processors.push( + new NoBadWordsLogitsProcessor( + generation_config.bad_words_ids, + generation_config.eos_token_id, + ), + ); + } + + if ( + generation_config.min_length !== null && + generation_config.eos_token_id !== null && + generation_config.min_length > 0 + ) { + processors.push( + new MinLengthLogitsProcessor( + generation_config.min_length, + generation_config.eos_token_id, + ), + ); + } + + if ( + generation_config.min_new_tokens !== null && + 
generation_config.eos_token_id !== null && + generation_config.min_new_tokens > 0 + ) { + processors.push( + new MinNewTokensLengthLogitsProcessor( + input_ids_seq_length, + generation_config.min_new_tokens, + generation_config.eos_token_id, + ), + ); + } + + // if (prefix_allowed_tokens_fn !== null) { + // processors.push(new PrefixConstrainedLogitsProcessor( + // prefix_allowed_tokens_fn, + // generation_config.num_beams / generation_config.num_beam_groups + // )); + // } + + if (generation_config.forced_bos_token_id !== null) { + processors.push( + new ForcedBOSTokenLogitsProcessor( + generation_config.forced_bos_token_id, + ), + ); + } + + if (generation_config.forced_eos_token_id !== null) { + processors.push( + new ForcedEOSTokenLogitsProcessor( + generation_config.max_length, + generation_config.forced_eos_token_id, + ), + ); + } + + // if (generation_config.remove_invalid_values === true) { + // processors.push(new InfNanRemoveLogitsProcessor()); + // } + + // if (generation_config.exponential_decay_length_penalty !== null) { + // processors.push(new ExponentialDecayLengthPenalty( + // generation_config.exponential_decay_length_penalty, + // generation_config.eos_token_id, + // input_ids_seq_length + // )); + // } + + // if (generation_config.suppress_tokens !== null) { + // processors.push(new SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); + // } + + if (generation_config.begin_suppress_tokens !== null) { + let begin_index = + input_ids_seq_length > 1 || + generation_config.forced_bos_token_id === null + ? 
input_ids_seq_length + : input_ids_seq_length + 1; + + if (generation_config.forced_decoder_ids !== null) { + // generation starts after the last token that is forced + begin_index += + generation_config.forced_decoder_ids[ + generation_config.forced_decoder_ids.length - 1 + ][0]; + } + processors.push( + new SuppressTokensAtBeginLogitsProcessor( + generation_config.begin_suppress_tokens, + begin_index, + ), + ); + } + + if (generation_config.forced_decoder_ids !== null) { + processors.push( + new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids), + ); + } + + if (logits_processor !== null) { + processors.extend(logits_processor); + } + + // `LogitNormalization` should always be the last logit processor, when present + // if (generation_config.renormalize_logits === true) { + // processors.push(new LogitNormalization()); + // } + + return processors; + } + + /** + * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. + * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. + * @param {import('./utils/generation.js').GenerationConfigType} generation_config A `GenerationConfig` object containing generation parameters. + * @returns {import('./utils/generation.js').GenerationConfigType} The final generation config object to be used by the model for text generation. 
+ */ + _get_generation_config(generation_config) { + // Create empty generation config (contains defaults) + // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them + let gen_config = new GenerationConfig(this.config); + + // Apply model's generation config, if it exists + if ("generation_config" in this) { + Object.assign(gen_config, this.generation_config); + } + + // Finally, use any generation config specified by the user + // when calling `generate` + if (generation_config !== null) { + Object.assign(gen_config, generation_config); + } + return gen_config; + } + + /** + * @typedef {import('./utils/maths.js').TypedArray} TypedArray + */ + + /** + * @typedef {{ sequences: Tensor, decoder_attentions: Tensor, cross_attentions: Tensor }} EncoderDecoderOutput + * @typedef {Object} DecoderOutput + * + * Generates text based on the given inputs and generation configuration using the model. + * @param {Tensor|Array|TypedArray} inputs An array of input token IDs. + * @param {Object|GenerationConfig|null} generation_config The generation configuration to use. If null, default configuration will be used. + * @param {Object|null} logits_processor An optional logits processor to use. If null, a new LogitsProcessorList instance will be created. + * @param {Object} options options + * @param {Object} [options.inputs_attention_mask=null] An optional attention mask for the inputs. + * @returns {Promise} An array of generated output sequences, where each sequence is an array of token IDs. + * @throws {Error} Throws an error if the inputs array is empty. 
+ */ + async generate( + inputs, + generation_config = null, + logits_processor = null, + { inputs_attention_mask = null } = {}, + ) { + if (!this.can_generate) { + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`; + + const modelType = this.config.model_type; + const possibleInfo = + MODEL_WITH_LM_HEAD_MAPPING_NAMES.get(modelType) ?? + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES.get(modelType) ?? + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.get(modelType) ?? + // ?? MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES.get(modelType) // TODO + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.get(modelType); + + if (possibleInfo) { + // TODO: support multiple possible classes + errorMessage += ` Please use the following class instead: '${possibleInfo[0]}'`; + } + throw Error(errorMessage); + } + + if ( + !(inputs instanceof Tensor) && + !isTypedArray(inputs) && + !Array.isArray(inputs) + ) { + throw Error( + `\`inputs\` must be a Tensor, TypedArray, or Array, but is "${inputs.constructor.name}".`, + ); + } + + let input_ids_seq_length; + + // Prepare `input_ids` which will be used for auto-regressive generation + // TODO: Update to align with HF transformers' implementation + if (this.config.is_encoder_decoder) { + // Generating from the encoder outputs + input_ids_seq_length = 0; + } else { + input_ids_seq_length = + inputs instanceof Tensor ? inputs.dims.at(-1) : inputs.length; + + // decoder-only + if (input_ids_seq_length === 0) { + throw Error("Must supply a non-empty array of input token ids."); + } + } + + // Update generation config with defaults + generation_config = this._get_generation_config(generation_config); + + logits_processor = logits_processor ?? 
new LogitsProcessorList(); + + // Update logits processor + logits_processor = this._get_logits_processor( + generation_config, + input_ids_seq_length, + logits_processor, + ); + + /** @type {number[]} */ + let eos_token_ids = generation_config.eos_token_id; + if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { + eos_token_ids = [eos_token_ids]; + } + + // TODO implement early_stopping + // https://huggingface.co/blog/how-to-generate + + let numOutputTokens = 1; + const maxOutputTokens = + numOutputTokens + (generation_config.max_new_tokens ?? Infinity); + + // Only use max length if max_new_tokens is not provided + const useMaxLength = + Number.isInteger(generation_config.max_length) && + (generation_config.max_new_tokens ?? null) === null; + let sampler = Sampler.getSampler(generation_config); + + // @ts-ignore + let beams = this.getStartBeams( + inputs, + generation_config, + numOutputTokens, + inputs_attention_mask, + ); + + while (beams.some((x) => !x.done) && numOutputTokens < maxOutputTokens) { + let newest_beams = []; + for (let beam of beams) { + if (beam.done) { + // Add this beam back into the pool + newest_beams.push(beam); + continue; } - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); - const modelType = MODEL_TYPE_MAPPING.get(modelName); - - let info; - if (modelType === MODEL_TYPES.DecoderOnly) { - info = await Promise.all([ - AutoConfig.from_pretrained(pretrained_model_name_or_path, options), - constructSession(pretrained_model_name_or_path, options.model_file_name ?? 
'decoder_model_merged', options), - getModelJSON(pretrained_model_name_or_path, 'generation_config.json', false, options), - ]); - - } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { - info = await Promise.all([ - AutoConfig.from_pretrained(pretrained_model_name_or_path, options), - constructSession(pretrained_model_name_or_path, 'encoder_model', options), - constructSession(pretrained_model_name_or_path, 'decoder_model_merged', options), - getModelJSON(pretrained_model_name_or_path, 'generation_config.json', false, options), - ]); - - } else if (modelType === MODEL_TYPES.MaskGeneration) { - info = await Promise.all([ - AutoConfig.from_pretrained(pretrained_model_name_or_path, options), - constructSession(pretrained_model_name_or_path, 'vision_encoder', options), - constructSession(pretrained_model_name_or_path, 'prompt_encoder_mask_decoder', options), - ]); - - } else if (modelType === MODEL_TYPES.EncoderDecoder) { - info = await Promise.all([ - AutoConfig.from_pretrained(pretrained_model_name_or_path, options), - constructSession(pretrained_model_name_or_path, 'encoder_model', options), - constructSession(pretrained_model_name_or_path, 'decoder_model_merged', options), - ]); - - } else { // should be MODEL_TYPES.EncoderOnly - if (modelType !== MODEL_TYPES.EncoderOnly) { - console.warn(`Model type for '${modelName}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`) - } - info = await Promise.all([ - AutoConfig.from_pretrained(pretrained_model_name_or_path, options), - constructSession(pretrained_model_name_or_path, options.model_file_name ?? 
'model', options) - ]); + if ( + useMaxLength && + beam.output_token_ids.length >= generation_config.max_length + ) { + // Set this beam to done and add it back into the pool + beam.done = true; + newest_beams.push(beam); + continue; } // @ts-ignore - return new this(...info); + let output = await this.runBeam(beam); + + // add attentions/scores to beam only if user requested + if (generation_config.output_attentions) { + this.addAttentionsToBeam(beam, output); + } + if (generation_config.output_scores) { + // TODO add + } + + // Logits are of the form [batch_size, out_seq_length, vocab_size] + // In most cases, this will be [batch_size, 1, vocab_size] + // So, we select the last token's logits: + // (equivalent to `logits = outputs.logits[:, -1, :]`) + let logits = output.logits.slice(null, -1, null); + + // Apply logits processor + logits_processor(beam.output_token_ids, logits); + + let sampledTokens = sampler(logits); + for (let [newTokenId, logProb] of sampledTokens) { + // use previous beam as a starting point + let newBeam = { ...beam }; + + // update new beam + // @ts-ignore + this.updateBeam(newBeam, newTokenId); + + newBeam.score += logProb; + + if (eos_token_ids && eos_token_ids.includes(newTokenId)) { + newBeam.done = true; + } + + newest_beams.push(newBeam); + } + } + ++numOutputTokens; + + // Next, we get the best beams, per ID + newest_beams = this.groupBeams(newest_beams).map( + (group) => + group + .sort((a, b) => b.score - a.score) // sort by score + .slice(0, generation_config.num_beams), // remove outside beam width + ); + + // Flatten beams + beams = newest_beams.flat(); + + // Run callback + if (generation_config.callback_function) { + generation_config.callback_function(beams); + } } - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Object containing input tensors - * @returns {Promise} Object containing output tensors - */ - async _call(model_inputs) { - return await this.forward(model_inputs); + // TODO: 
Ensure that we can return non-batched outputs + + const groupedBeams = this.groupBeams(beams); + + const getFlattened = (key) => + groupedBeams + .map((batch) => { + if (generation_config.num_return_sequences > 1) { + return batch + .slice(0, generation_config.num_return_sequences) + .map((x) => x[key]); + } else { + return [batch[0][key]]; + } + }) + .flat(); // Flatten across batches (depth=1) + + const sequences = getFlattened("output_token_ids"); // [1, seqLength] + + if (generation_config.return_dict_in_generate) { + // NOTE: `decoder_attentions` and `cross_attentions` should be: + // list (one element for each generated token) + // of list (one element for each layer of the decoder) + // of torch.FloatTensor of shape (batch_size, num_heads, generated_length, sequence_length) + // However, since we are only generating one batch at a time, they are of the form: + // list (batches) + // of list (one element for each generated token) + // of list (one element for each layer of the decoder) + // of torch.FloatTensor of shape (1, num_heads, generated_length, sequence_length) + // + // TODO: In future (when true parallelism, we should be able to return the correct shape) + + const decoder_attentions = getFlattened("decoder_attentions"); + const cross_attentions = getFlattened("cross_attentions"); + + return { + sequences, + + decoder_attentions, + cross_attentions, + }; + } else { + return sequences; + } + } + + /** + * Helper function to add attentions to beam + * @param {Object} beam + * @param {Object} output + * @private + */ + addAttentionsToBeam(beam, output) { + if (this.config.is_encoder_decoder) { + if (!output.cross_attentions || output.cross_attentions.length === 0) { + throw Error( + "`output_attentions` is true, but the model did not produce cross-attentions. 
" + + "This is most likely because the model was not exported with `output_attentions=True`.", + ); + } + if (!beam.cross_attentions) { + beam.cross_attentions = []; + } + beam.cross_attentions.push(output.cross_attentions); } - /** - * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method - * will be chosen based on the model type. - * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. - * @returns {Promise} The output data from the model in the format specified in the ONNX model. - * @throws {Error} This method must be implemented in subclasses. - */ - async forward(model_inputs) { - return await this._forward(this, model_inputs); + if (!output.decoder_attentions || output.decoder_attentions.length === 0) { + throw Error( + "`output_attentions` is true, but the model did not produce decoder-attentions. " + + "This is most likely because the model was not exported with `output_attentions=True`.", + ); + } + if (!beam.decoder_attentions) { + beam.decoder_attentions = []; + } + beam.decoder_attentions.push(output.decoder_attentions); + } + + /** + * Groups an array of beam objects by their ids. + * + * @param {Array} beams The array of beam objects to group. + * @returns {Array} An array of arrays, where each inner array contains beam objects with the same id. + */ + groupBeams(beams) { + // Group beams by their ids + const groups = Object.create(null); + for (const obj of beams) { + if (groups[obj.id] === undefined) { + groups[obj.id] = [obj]; + } else { + groups[obj.id].push(obj); + } } - /** - * @param {import('./utils/generation.js').GenerationConfigType} generation_config - * @param {number} input_ids_seq_length The starting sequence length for the input ids. 
- * @returns {LogitsProcessorList} - * @private - */ - _get_logits_processor( - generation_config, - input_ids_seq_length, - // encoder_input_ids, TODO - // prefix_allowed_tokens_fn, TODO - logits_processor = null - ) { - const processors = new LogitsProcessorList(); + return Object.values(groups); + } - // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { - // processors.push(new HammingDiversityLogitsProcessor( - // generation_config.diversity_penalty, - // generation_config.num_beams, - // generation_config.num_beam_groups - // )); - // } + /** + * Returns an object containing past key values from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @param {Object} pastKeyValues The previous past key values. + * @returns {Object} An object containing past key values. + */ + getPastKeyValues(decoderResults, pastKeyValues) { + const pkvs = Object.create(null); - // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { - // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( - // generation_config.encoder_repetition_penalty, - // encoder_input_ids - // )); - // } - - if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1.0) { - processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty)); - } - - if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) { - processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)); - } - - // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { - // if (this.config.is_encoder_decoder) { - // processors.push(new EncoderNoRepeatNGramLogitsProcessor( - // generation_config.encoder_no_repeat_ngram_size, - // encoder_input_ids - // )); - // } else { - // throw new 
Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); - // } - // } - - if (generation_config.bad_words_ids !== null) { - processors.push(new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)); - } - - if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) { - processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)); - } - - if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) { - processors.push(new MinNewTokensLengthLogitsProcessor( - input_ids_seq_length, - generation_config.min_new_tokens, - generation_config.eos_token_id - )); - } - - // if (prefix_allowed_tokens_fn !== null) { - // processors.push(new PrefixConstrainedLogitsProcessor( - // prefix_allowed_tokens_fn, - // generation_config.num_beams / generation_config.num_beam_groups - // )); - // } - - - if (generation_config.forced_bos_token_id !== null) { - processors.push(new ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id)); - } - - if (generation_config.forced_eos_token_id !== null) { - processors.push(new ForcedEOSTokenLogitsProcessor( - generation_config.max_length, - generation_config.forced_eos_token_id - )); - } - - // if (generation_config.remove_invalid_values === true) { - // processors.push(new InfNanRemoveLogitsProcessor()); - // } - - // if (generation_config.exponential_decay_length_penalty !== null) { - // processors.push(new ExponentialDecayLengthPenalty( - // generation_config.exponential_decay_length_penalty, - // generation_config.eos_token_id, - // input_ids_seq_length - // )); - // } - - // if (generation_config.suppress_tokens !== null) { - // processors.push(new SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); - // } - - if (generation_config.begin_suppress_tokens !== null) 
{ - let begin_index = (input_ids_seq_length > 1 || generation_config.forced_bos_token_id === null) - ? input_ids_seq_length - : input_ids_seq_length + 1; - - if (generation_config.forced_decoder_ids !== null) { - // generation starts after the last token that is forced - begin_index += generation_config.forced_decoder_ids[generation_config.forced_decoder_ids.length - 1][0]; - } - processors.push(new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)); - } - - if (generation_config.forced_decoder_ids !== null) { - processors.push(new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids)); - } - - if (logits_processor !== null) { - processors.extend(logits_processor) - } - - // `LogitNormalization` should always be the last logit processor, when present - // if (generation_config.renormalize_logits === true) { - // processors.push(new LogitNormalization()); - // } - - return processors; - } - - /** - * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. - * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. - * @param {import('./utils/generation.js').GenerationConfigType} generation_config A `GenerationConfig` object containing generation parameters. - * @returns {import('./utils/generation.js').GenerationConfigType} The final generation config object to be used by the model for text generation. 
- */ - _get_generation_config(generation_config) { - // Create empty generation config (contains defaults) - // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them - let gen_config = new GenerationConfig(this.config); - - // Apply model's generation config, if it exists - if ('generation_config' in this) { - Object.assign(gen_config, this.generation_config); - } - - // Finally, use any generation config specified by the user - // when calling `generate` - if (generation_config !== null) { - Object.assign(gen_config, generation_config); - } - return gen_config; - } - - /** - * @typedef {import('./utils/maths.js').TypedArray} TypedArray - */ - - /** - * @typedef {{ sequences: Tensor, decoder_attentions: Tensor, cross_attentions: Tensor }} EncoderDecoderOutput - * @typedef {Object} DecoderOutput - * - * Generates text based on the given inputs and generation configuration using the model. - * @param {Tensor|Array|TypedArray} inputs An array of input token IDs. - * @param {Object|GenerationConfig|null} generation_config The generation configuration to use. If null, default configuration will be used. - * @param {Object|null} logits_processor An optional logits processor to use. If null, a new LogitsProcessorList instance will be created. - * @param {Object} options options - * @param {Object} [options.inputs_attention_mask=null] An optional attention mask for the inputs. - * @returns {Promise} An array of generated output sequences, where each sequence is an array of token IDs. - * @throws {Error} Throws an error if the inputs array is empty. 
- */ - async generate( - inputs, - generation_config = null, - logits_processor = null, - { - inputs_attention_mask = null - } = {}, - ) { - if (!this.can_generate) { - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.` - - const modelType = this.config.model_type; - const possibleInfo = - MODEL_WITH_LM_HEAD_MAPPING_NAMES.get(modelType) - ?? MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES.get(modelType) - ?? MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.get(modelType) - // ?? MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES.get(modelType) // TODO - ?? MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.get(modelType); - - if (possibleInfo) { - // TODO: support multiple possible classes - errorMessage += ` Please use the following class instead: '${possibleInfo[0]}'`; - } - throw Error(errorMessage); - } - - if (!(inputs instanceof Tensor) && !isTypedArray(inputs) && !Array.isArray(inputs)) { - throw Error(`\`inputs\` must be a Tensor, TypedArray, or Array, but is "${inputs.constructor.name}".`); - } - - let input_ids_seq_length; - - // Prepare `input_ids` which will be used for auto-regressive generation - // TODO: Update to align with HF transformers' implementation - if (this.config.is_encoder_decoder) { - // Generating from the encoder outputs - input_ids_seq_length = 0; + for (const name in decoderResults) { + if (name.startsWith("present")) { + let newName = name.replace("present", "past_key_values"); + if (pastKeyValues && name.includes("encoder")) { + // Optimization introduced by optimum to reuse past key values. So, we just replace the constant + // outputs with the previous past key values. + // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 + pkvs[newName] = pastKeyValues[newName]; } else { - input_ids_seq_length = inputs instanceof Tensor ? 
inputs.dims.at(-1) : inputs.length; - - // decoder-only - if (input_ids_seq_length === 0) { - throw Error("Must supply a non-empty array of input token ids.") - } + pkvs[newName] = decoderResults[name]; } + } + } + return pkvs; + } - // Update generation config with defaults - generation_config = this._get_generation_config(generation_config); + /** + * Returns an object containing attentions from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @returns {Object} An object containing attentions. + */ + getAttentions(decoderResults) { + const attns = Object.create(null); - logits_processor = logits_processor ?? new LogitsProcessorList() - - // Update logits processor - logits_processor = this._get_logits_processor( - generation_config, - input_ids_seq_length, - logits_processor - ) - - /** @type {number[]} */ - let eos_token_ids = generation_config.eos_token_id; - if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { - eos_token_ids = [eos_token_ids]; + for (const attnName of ["cross_attentions", "decoder_attentions"]) { + const result = []; + for (const name in decoderResults) { + if (name.startsWith(attnName)) { + const index = name.split(".").pop(); + result[index] = decoderResults[name]; } + } + attns[attnName] = result; + } + return attns; + } - // TODO implement early_stopping - // https://huggingface.co/blog/how-to-generate + /** + * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. + * + * @param {Object} decoderFeeds The decoder feeds object to add past key values to. + * @param {Object} pastKeyValues An object containing past key values. 
+ */ + addPastKeyValues(decoderFeeds, pastKeyValues) { + if (pastKeyValues) { + Object.assign(decoderFeeds, pastKeyValues); + } else { + // TODO support batches (i.e., batch_size > 1) + const batch_size = 1; - let numOutputTokens = 1; - const maxOutputTokens = numOutputTokens + (generation_config.max_new_tokens ?? Infinity); - - // Only use max length if max_new_tokens is not provided - const useMaxLength = Number.isInteger(generation_config.max_length) && (generation_config.max_new_tokens ?? null) === null; - let sampler = Sampler.getSampler(generation_config); + // @ts-ignore + if (this.config.is_encoder_decoder && (this.add_encoder_pkv ?? true)) { + // @ts-ignore + let encoder_dims = [ + batch_size, + this.num_encoder_heads, + 0, + this.encoder_dim_kv, + ]; + // @ts-ignore + let decoder_dims = [ + batch_size, + this.num_decoder_heads, + 0, + this.decoder_dim_kv, + ]; + // @ts-ignore + for (let i = 0; i < this.num_decoder_layers; ++i) { + decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor( + "float32", + [], + encoder_dims, + ); + decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor( + "float32", + [], + encoder_dims, + ); + decoderFeeds[`past_key_values.${i}.decoder.key`] = new Tensor( + "float32", + [], + decoder_dims, + ); + decoderFeeds[`past_key_values.${i}.decoder.value`] = new Tensor( + "float32", + [], + decoder_dims, + ); + } + } else if (this.config.model_type === "falcon") { + // NOTE: Custom implementation for Falcon + // @ts-ignore + let dims = [batch_size * this.num_heads, 0, this.dim_kv]; + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key`] = new Tensor( + "float32", + [], + dims, + ); + decoderFeeds[`past_key_values.${i}.value`] = new Tensor( + "float32", + [], + dims, + ); + } + } else if (this.config.multi_query) { + // e.g., for `gpt_bigcode` + // @ts-ignore + let dims = [batch_size * this.num_heads, 0, 2 * this.dim_kv]; + // @ts-ignore + for (let i = 0; i < 
this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor( + "float32", + [], + dims, + ); + } + } else if (this.config.model_type === "bloom") { + // NOTE: Custom implementation for Bloom // @ts-ignore - let beams = this.getStartBeams(inputs, generation_config, numOutputTokens, inputs_attention_mask); - - while (beams.some(x => !x.done) && numOutputTokens < maxOutputTokens) { - let newest_beams = []; - for (let beam of beams) { - if (beam.done) { - // Add this beam back into the pool - newest_beams.push(beam); - continue - } - if (useMaxLength && beam.output_token_ids.length >= generation_config.max_length) { - // Set this beam to done and add it back into the pool - beam.done = true; - newest_beams.push(beam); - continue - } - - // @ts-ignore - let output = await this.runBeam(beam); - - // add attentions/scores to beam only if user requested - if (generation_config.output_attentions) { - this.addAttentionsToBeam(beam, output); - } - if (generation_config.output_scores) { - // TODO add - } - - // Logits are of the form [batch_size, out_seq_length, vocab_size] - // In most cases, this will be [batch_size, 1, vocab_size] - // So, we select the last token's logits: - // (equivalent to `logits = outputs.logits[:, -1, :]`) - let logits = output.logits.slice(null, -1, null); - - // Apply logits processor - logits_processor(beam.output_token_ids, logits); - - let sampledTokens = sampler(logits); - for (let [newTokenId, logProb] of sampledTokens) { - // use previous beam as a starting point - let newBeam = { ...beam }; - - // update new beam - // @ts-ignore - this.updateBeam(newBeam, newTokenId); - - newBeam.score += logProb; - - if (eos_token_ids && eos_token_ids.includes(newTokenId)) { - newBeam.done = true; - } - - newest_beams.push(newBeam); - } - } - ++numOutputTokens; - - // Next, we get the best beams, per ID - newest_beams = this.groupBeams(newest_beams).map( - group => group - .sort((a, b) => b.score - a.score) // sort by score - 
.slice(0, generation_config.num_beams) // remove outside beam width - ); - - // Flatten beams - beams = newest_beams.flat(); - - // Run callback - if (generation_config.callback_function) { - generation_config.callback_function(beams); - } + let keyDims = [batch_size * this.num_heads, this.dim_kv, 0]; // [batch_size x num_heads,64,past_sequence_length] + // @ts-ignore + let valueDims = [batch_size * this.num_heads, 0, this.dim_kv]; // [batch_size x num_heads,past_sequence_length,64] + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key`] = new Tensor( + "float32", + [], + keyDims, + ); + decoderFeeds[`past_key_values.${i}.value`] = new Tensor( + "float32", + [], + valueDims, + ); } - - // TODO: Ensure that we can return non-batched outputs - - const groupedBeams = this.groupBeams(beams); - - const getFlattened = (key) => groupedBeams.map( - batch => { - if (generation_config.num_return_sequences > 1) { - return batch.slice(0, generation_config.num_return_sequences).map(x => x[key]); - } else { - return [batch[0][key]]; - } - } - ).flat(); // Flatten across batches (depth=1) - - const sequences = getFlattened('output_token_ids'); // [1, seqLength] - - if (generation_config.return_dict_in_generate) { - // NOTE: `decoder_attentions` and `cross_attentions` should be: - // list (one element for each generated token) - // of list (one element for each layer of the decoder) - // of torch.FloatTensor of shape (batch_size, num_heads, generated_length, sequence_length) - // However, since we are only generating one batch at a time, they are of the form: - // list (batches) - // of list (one element for each generated token) - // of list (one element for each layer of the decoder) - // of torch.FloatTensor of shape (1, num_heads, generated_length, sequence_length) - // - // TODO: In future (when true parallelism, we should be able to return the correct shape) - - const decoder_attentions = getFlattened('decoder_attentions'); - 
const cross_attentions = getFlattened('cross_attentions'); - - return { - sequences, - - decoder_attentions, - cross_attentions, - } - } else { - return sequences; + } else { + // Decoder-only + // @ts-ignore + let dims = [batch_size, this.num_heads, 0, this.dim_kv]; + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key`] = new Tensor( + "float32", + [], + dims, + ); + decoderFeeds[`past_key_values.${i}.value`] = new Tensor( + "float32", + [], + dims, + ); } + } } + } - /** - * Helper function to add attentions to beam - * @param {Object} beam - * @param {Object} output - * @private - */ - addAttentionsToBeam(beam, output) { - if (this.config.is_encoder_decoder) { - if (!output.cross_attentions || output.cross_attentions.length === 0) { - throw Error( - "`output_attentions` is true, but the model did not produce cross-attentions. " + - "This is most likely because the model was not exported with `output_attentions=True`." - ) - } - if (!beam.cross_attentions) { - beam.cross_attentions = []; - } - beam.cross_attentions.push(output.cross_attentions); - } + /** + * Initializes and returns the beam for text generation task + * @param {Tensor} inputTokenIds The input token ids. + * @param {Object} generation_config The generation config. + * @param {number} numOutputTokens The number of tokens to be generated. + * @param {Tensor} inputs_attention_mask Optional input attention mask. + * @returns {any} A Beam object representing the initialized beam. + * @private + */ + getStartBeams( + inputTokenIds, + generation_config, + numOutputTokens, + inputs_attention_mask, + ) { + return this._getStartBeams( + this, + inputTokenIds, + generation_config, + numOutputTokens, + inputs_attention_mask, + ); + } - if (!output.decoder_attentions || output.decoder_attentions.length === 0) { - throw Error( - "`output_attentions` is true, but the model did not produce decoder-attentions. 
" + - "This is most likely because the model was not exported with `output_attentions=True`." - ) - } - if (!beam.decoder_attentions) { - beam.decoder_attentions = []; - } - beam.decoder_attentions.push(output.decoder_attentions); - } + /** + * Runs a single step of the beam search generation algorithm. + * @param {any} beam The current beam being generated. + * @returns {Promise} The updated beam after a single generation step. + * @private + */ + async runBeam(beam) { + return await this._runBeam(this, beam); + } - /** - * Groups an array of beam objects by their ids. - * - * @param {Array} beams The array of beam objects to group. - * @returns {Array} An array of arrays, where each inner array contains beam objects with the same id. - */ - groupBeams(beams) { - // Group beams by their ids - const groups = Object.create(null); - for (const obj of beams) { - if (groups[obj.id] === undefined) { - groups[obj.id] = [obj]; - } else { - groups[obj.id].push(obj); - } - } - - return Object.values(groups); - } - - /** - * Returns an object containing past key values from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @param {Object} pastKeyValues The previous past key values. - * @returns {Object} An object containing past key values. - */ - getPastKeyValues(decoderResults, pastKeyValues) { - - const pkvs = Object.create(null); - - for (const name in decoderResults) { - if (name.startsWith('present')) { - let newName = name.replace('present', 'past_key_values'); - - if (pastKeyValues && name.includes('encoder')) { - // Optimization introduced by optimum to reuse past key values. So, we just replace the constant - // outputs with the previous past key values. 
- // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 - pkvs[newName] = pastKeyValues[newName]; - } else { - pkvs[newName] = decoderResults[name]; - } - } - } - return pkvs; - } - - /** - * Returns an object containing attentions from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @returns {Object} An object containing attentions. - */ - getAttentions(decoderResults) { - const attns = Object.create(null); - - for (const attnName of ['cross_attentions', 'decoder_attentions']) { - const result = []; - for (const name in decoderResults) { - if (name.startsWith(attnName)) { - const index = name.split('.').pop() - result[index] = decoderResults[name]; - } - } - attns[attnName] = result; - } - return attns; - } - - /** - * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. - * - * @param {Object} decoderFeeds The decoder feeds object to add past key values to. - * @param {Object} pastKeyValues An object containing past key values. - */ - addPastKeyValues(decoderFeeds, pastKeyValues) { - if (pastKeyValues) { - Object.assign(decoderFeeds, pastKeyValues) - } else { - // TODO support batches (i.e., batch_size > 1) - const batch_size = 1; - - // @ts-ignore - if (this.config.is_encoder_decoder && (this.add_encoder_pkv ?? 
true)) { - // @ts-ignore - let encoder_dims = [batch_size, this.num_encoder_heads, 0, this.encoder_dim_kv]; - // @ts-ignore - let decoder_dims = [batch_size, this.num_decoder_heads, 0, this.decoder_dim_kv]; - // @ts-ignore - for (let i = 0; i < this.num_decoder_layers; ++i) { - decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor('float32', [], encoder_dims) - decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor('float32', [], encoder_dims) - decoderFeeds[`past_key_values.${i}.decoder.key`] = new Tensor('float32', [], decoder_dims) - decoderFeeds[`past_key_values.${i}.decoder.value`] = new Tensor('float32', [], decoder_dims) - } - } else if (this.config.model_type === 'falcon') { - // NOTE: Custom implementation for Falcon - // @ts-ignore - let dims = [batch_size * this.num_heads, 0, this.dim_kv] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims) - decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims) - } - } else if (this.config.multi_query) { // e.g., for `gpt_bigcode` - // @ts-ignore - let dims = [batch_size * this.num_heads, 0, 2 * this.dim_kv] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor('float32', [], dims) - } - } else if (this.config.model_type === 'bloom') { - // NOTE: Custom implementation for Bloom - - // @ts-ignore - let keyDims = [batch_size * this.num_heads, this.dim_kv, 0] // [batch_size x num_heads,64,past_sequence_length] - // @ts-ignore - let valueDims = [batch_size * this.num_heads, 0, this.dim_kv] // [batch_size x num_heads,past_sequence_length,64] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], keyDims) - decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], valueDims) - } - } else { // Decoder-only - // @ts-ignore - let dims = 
[batch_size, this.num_heads, 0, this.dim_kv] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims) - decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims) - } - } - } - } - - /** - * Initializes and returns the beam for text generation task - * @param {Tensor} inputTokenIds The input token ids. - * @param {Object} generation_config The generation config. - * @param {number} numOutputTokens The number of tokens to be generated. - * @param {Tensor} inputs_attention_mask Optional input attention mask. - * @returns {any} A Beam object representing the initialized beam. - * @private - */ - getStartBeams(inputTokenIds, generation_config, numOutputTokens, inputs_attention_mask) { - return this._getStartBeams(this, inputTokenIds, generation_config, numOutputTokens, inputs_attention_mask) - } - - /** - * Runs a single step of the beam search generation algorithm. - * @param {any} beam The current beam being generated. - * @returns {Promise} The updated beam after a single generation step. - * @private - */ - async runBeam(beam) { - return await this._runBeam(this, beam); - } - - /** - * Update a beam with a new token ID. - * @param {Object} beam The beam to update. - * @param {number} newTokenId The new token ID to add to the beam's output. - * @private - */ - updateBeam(beam, newTokenId) { - return this._updateBeam(beam, newTokenId); - } + /** + * Update a beam with a new token ID. + * @param {Object} beam The beam to update. + * @param {number} newTokenId The new token ID to add to the beam's output. + * @private + */ + updateBeam(beam, newTokenId) { + return this._updateBeam(beam, newTokenId); + } } ////////////////////////////////////////////////// // Base model output class -export class ModelOutput { } +export class ModelOutput {} /** * Base class for model's outputs, with potential hidden states and attentions. 
*/ export class BaseModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. - * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - */ - constructor({ last_hidden_state, hidden_states = null, attentions = null }) { - super(); - this.last_hidden_state = last_hidden_state; - this.hidden_states = hidden_states; - this.attentions = attentions; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. + * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + */ + constructor({ last_hidden_state, hidden_states = null, attentions = null }) { + super(); + this.last_hidden_state = last_hidden_state; + this.hidden_states = hidden_states; + this.attentions = attentions; + } } ////////////////////////////////////////////////// // Bert models -export class BertPreTrainedModel extends PreTrainedModel { } -export class BertModel extends BertPreTrainedModel { } +export class BertPreTrainedModel extends PreTrainedModel {} +export class BertModel extends BertPreTrainedModel {} /** * BertForMaskedLM is a class representing a BERT model for masked language modeling. */ export class BertForMaskedLM extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * BertForSequenceClassification is a class representing a BERT model for sequence classification. */ export class BertForSequenceClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * BertForTokenClassification is a class representing a BERT model for token classification. */ export class BertForTokenClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * BertForQuestionAnswering is a class representing a BERT model for question answering. */ export class BertForQuestionAnswering extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // RoFormer models -export class RoFormerPreTrainedModel extends PreTrainedModel { } +export class RoFormerPreTrainedModel extends PreTrainedModel {} /** * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. */ -export class RoFormerModel extends RoFormerPreTrainedModel { } +export class RoFormerModel extends RoFormerPreTrainedModel {} /** * RoFormer Model with a `language modeling` head on top. */ export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1514,15 +1729,15 @@ export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { * e.g. for Named-Entity-Recognition (NER) tasks. */ export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1530,56 +1745,56 @@ export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } // TODO: Add RoFormerForCausalLM and RoFormerForMultipleChoice ////////////////////////////////////////////////// ////////////////////////////////////////////////// // ConvBert models -export class ConvBertPreTrainedModel extends PreTrainedModel { } +export class ConvBertPreTrainedModel extends PreTrainedModel {} /** * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. */ -export class ConvBertModel extends ConvBertPreTrainedModel { } +export class ConvBertModel extends ConvBertPreTrainedModel {} /** * ConvBERT Model with a language modeling head on top. */ export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1587,15 +1802,15 @@ export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { * e.g. for Named-Entity-Recognition (NER) tasks. */ export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1603,73 +1818,72 @@ export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`) */ export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Electra models -export class ElectraPreTrainedModel extends PreTrainedModel { } +export class ElectraPreTrainedModel extends PreTrainedModel {} /** * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. * Identical to the BERT model except that it uses an additional linear layer between the embedding * layer and the encoder if the hidden size and embedding size are different. */ -export class ElectraModel extends ElectraPreTrainedModel { } +export class ElectraModel extends ElectraPreTrainedModel {} // TODO add ElectraForPreTraining /** * Electra model with a language modeling head on top. */ export class ElectraForMaskedLM extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class ElectraForSequenceClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * Electra model with a token classification head on top. */ export class ElectraForTokenClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1677,141 +1891,140 @@ export class ElectraForTokenClassification extends ElectraPreTrainedModel { * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // CamemBERT models -export class CamembertPreTrainedModel extends PreTrainedModel { } +export class CamembertPreTrainedModel extends PreTrainedModel {} /** * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. */ -export class CamembertModel extends CamembertPreTrainedModel { } +export class CamembertModel extends CamembertPreTrainedModel {} /** * CamemBERT Model with a `language modeling` head on top. */ export class CamembertForMaskedLM extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. */ export class CamembertForSequenceClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class CamembertForTokenClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * CamemBERT Model with a span classification head on top for extractive question-answering tasks */ export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // DeBERTa models -export class DebertaPreTrainedModel extends PreTrainedModel { } +export class DebertaPreTrainedModel extends PreTrainedModel {} /** * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. */ -export class DebertaModel extends DebertaPreTrainedModel { } +export class DebertaModel extends DebertaPreTrainedModel {} /** * DeBERTa Model with a `language modeling` head on top. */ export class DebertaForMaskedLM extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class DebertaForSequenceClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class DebertaForTokenClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1819,70 +2032,70 @@ export class DebertaForTokenClassification extends DebertaPreTrainedModel { * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // DeBERTa-v2 models -export class DebertaV2PreTrainedModel extends PreTrainedModel { } +export class DebertaV2PreTrainedModel extends PreTrainedModel {} /** * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. */ -export class DebertaV2Model extends DebertaV2PreTrainedModel { } +export class DebertaV2Model extends DebertaV2PreTrainedModel {} /** * DeBERTa-V2 Model with a `language modeling` head on top. */ export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** @@ -1890,123 +2103,121 @@ export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // DistilBert models -export class DistilBertPreTrainedModel extends PreTrainedModel { } -export class DistilBertModel extends DistilBertPreTrainedModel { } +export class DistilBertPreTrainedModel extends PreTrainedModel {} +export class DistilBertModel extends DistilBertPreTrainedModel {} /** * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. */ export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. */ export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } - /** * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. */ export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } /** * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. 
*/ export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // ESM models -export class EsmPreTrainedModel extends PreTrainedModel { } +export class EsmPreTrainedModel extends PreTrainedModel {} /** * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. */ -export class EsmModel extends EsmPreTrainedModel { } +export class EsmModel extends EsmPreTrainedModel {} /** * ESM Model with a `language modeling` head on top. */ export class EsmForMaskedLM extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class EsmForSequenceClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** @@ -2014,968 +2225,978 @@ export class EsmForSequenceClassification extends EsmPreTrainedModel { * e.g. for Named-Entity-Recognition (NER) tasks. */ export class EsmForTokenClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // MobileBert models -export class MobileBertPreTrainedModel extends PreTrainedModel { } -export class MobileBertModel extends MobileBertPreTrainedModel { } +export class MobileBertPreTrainedModel extends PreTrainedModel {} +export class MobileBertModel extends MobileBertPreTrainedModel {} /** * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. 
*/ export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * MobileBert Model with a span classification head on top for extractive question-answering tasks */ export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // MPNet models -export class MPNetPreTrainedModel extends PreTrainedModel { } +export class MPNetPreTrainedModel extends PreTrainedModel {} /** * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. */ -export class MPNetModel extends MPNetPreTrainedModel { } +export class MPNetModel extends MPNetPreTrainedModel {} /** * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. */ export class MPNetForMaskedLM extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. */ export class MPNetForSequenceClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * MPNetForTokenClassification is a class representing a MPNet model for token classification. */ export class MPNetForTokenClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. */ export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. 
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // SqueezeBert models -export class SqueezeBertPreTrainedModel extends PreTrainedModel { } -export class SqueezeBertModel extends SqueezeBertPreTrainedModel { } +export class SqueezeBertPreTrainedModel extends PreTrainedModel {} +export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Albert models -export class AlbertPreTrainedModel extends PreTrainedModel { } -export class AlbertModel extends AlbertPreTrainedModel { } +export class AlbertPreTrainedModel extends PreTrainedModel {} +export class AlbertModel extends AlbertPreTrainedModel {} export class AlbertForSequenceClassification extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } export class AlbertForMaskedLM extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // T5 models -export class T5PreTrainedModel extends PreTrainedModel { }; +export class T5PreTrainedModel extends PreTrainedModel {} -export class T5Model extends T5PreTrainedModel { } +export class T5Model extends T5PreTrainedModel {} /** * T5Model is a class representing a T5 model for conditional generation. */ export class T5ForConditionalGeneration extends T5PreTrainedModel { + /** + * Creates a new instance of the `T5ForConditionalGeneration` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. + * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `T5ForConditionalGeneration` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. 
- * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.num_decoder_layers; + this.num_decoder_heads = this.config.num_heads; + this.decoder_dim_kv = this.config.d_kv; - this.num_decoder_layers = this.config.num_decoder_layers; - this.num_decoder_heads = this.config.num_heads; - this.decoder_dim_kv = this.config.d_kv; - - this.num_encoder_layers = this.config.num_layers; - this.num_encoder_heads = this.config.num_heads; - this.encoder_dim_kv = this.config.d_kv; - } + this.num_encoder_layers = this.config.num_layers; + this.num_encoder_heads = this.config.num_heads; + this.encoder_dim_kv = this.config.d_kv; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // LONGT5 models /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class LongT5PreTrainedModel extends PreTrainedModel { }; +export class LongT5PreTrainedModel extends PreTrainedModel {} /** * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. */ -export class LongT5Model extends LongT5PreTrainedModel { } +export class LongT5Model extends LongT5PreTrainedModel {} /** * LONGT5 Model with a `language modeling` head on top. */ export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel { - /** - * Creates a new instance of the `LongT5ForConditionalGeneration` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + /** + * Creates a new instance of the `LongT5ForConditionalGeneration` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. + * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - this.num_decoder_layers = this.config.num_decoder_layers; - this.num_decoder_heads = this.config.num_heads; - this.decoder_dim_kv = this.config.d_kv; + this.num_decoder_layers = this.config.num_decoder_layers; + this.num_decoder_heads = this.config.num_heads; + this.decoder_dim_kv = this.config.d_kv; - this.num_encoder_layers = this.config.num_layers; - this.num_encoder_heads = this.config.num_heads; - this.encoder_dim_kv = this.config.d_kv; - } + this.num_encoder_layers = this.config.num_layers; + this.num_encoder_heads = this.config.num_heads; + this.encoder_dim_kv = this.config.d_kv; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // MT5 models -export class MT5PreTrainedModel extends PreTrainedModel { }; +export class MT5PreTrainedModel extends PreTrainedModel {} -export class MT5Model extends MT5PreTrainedModel { } +export class MT5Model extends MT5PreTrainedModel {} /** * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. */ export class MT5ForConditionalGeneration extends MT5PreTrainedModel { + /** + * Creates a new instance of the `MT5ForConditionalGeneration` class. + * @param {any} config The model configuration. 
+ * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `MT5ForConditionalGeneration` class. - * @param {any} config The model configuration. - * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.num_decoder_layers; + this.num_decoder_heads = this.config.num_heads; + this.decoder_dim_kv = this.config.d_kv; - this.num_decoder_layers = this.config.num_decoder_layers; - this.num_decoder_heads = this.config.num_heads; - this.decoder_dim_kv = this.config.d_kv; - - this.num_encoder_layers = this.config.num_layers; - this.num_encoder_heads = this.config.num_heads; - this.encoder_dim_kv = this.config.d_kv; - } + this.num_encoder_layers = this.config.num_layers; + this.num_encoder_heads = this.config.num_heads; + this.encoder_dim_kv = this.config.d_kv; + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Bart models -export class BartPretrainedModel extends PreTrainedModel { }; +export class BartPretrainedModel extends PreTrainedModel {} /** * The bare BART Model outputting raw hidden-states without any specific head on top. 
*/ -export class BartModel extends BartPretrainedModel { } +export class BartModel extends BartPretrainedModel {} /** * The BART Model with a language modeling head. Can be used for summarization. */ export class BartForConditionalGeneration extends BartPretrainedModel { + /** + * Creates a new instance of the `BartForConditionalGeneration` class. + * @param {Object} config The configuration object for the Bart model. + * @param {Object} session The ONNX session used to execute the model. + * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. + * @param {Object} generation_config The generation configuration object. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `BartForConditionalGeneration` class. - * @param {Object} config The configuration object for the Bart model. - * @param {Object} session The ONNX session used to execute the model. - * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. - * @param {Object} generation_config The generation configuration object. 
- */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; - - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } /** * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) */ export class BartForSequenceClassification extends BartPretrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // MBart models -export class MBartPreTrainedModel extends PreTrainedModel { }; +export class MBartPreTrainedModel extends PreTrainedModel {} /** * The bare MBART Model outputting raw hidden-states without any specific head on top. */ -export class MBartModel extends MBartPreTrainedModel { } +export class MBartModel extends MBartPreTrainedModel {} /** * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. */ export class MBartForConditionalGeneration extends MBartPreTrainedModel { + /** + * Creates a new instance of the `MBartForConditionalGeneration` class. + * @param {Object} config The configuration object for the Bart model. + * @param {Object} session The ONNX session used to execute the model. + * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. + * @param {Object} generation_config The generation configuration object. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `MBartForConditionalGeneration` class. - * @param {Object} config The configuration object for the Bart model. - * @param {Object} session The ONNX session used to execute the model. - * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. - * @param {Object} generation_config The generation configuration object. 
- */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; - - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } /** * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). */ export class MBartForSequenceClassification extends MBartPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } - export class MBartForCausalLM extends MBartPreTrainedModel { - /** - * Creates a new instance of the `MBartForCausalLM` class. - * @param {Object} config Configuration object for the model. 
- * @param {Object} decoder_merged_session ONNX Session object for the decoder. - * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config, decoder_merged_session, generation_config) { - super(config, decoder_merged_session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `MBartForCausalLM` class. + * @param {Object} config Configuration object for the model. + * @param {Object} decoder_merged_session ONNX Session object for the decoder. + * @param {Object} generation_config Configuration object for the generation process. + */ + constructor(config, decoder_merged_session, generation_config) { + super(config, decoder_merged_session); + this.generation_config = generation_config; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Blenderbot models -export class BlenderbotPreTrainedModel extends PreTrainedModel { }; +export class BlenderbotPreTrainedModel extends PreTrainedModel {} /** * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. 
*/ -export class BlenderbotModel extends BlenderbotPreTrainedModel { } +export class BlenderbotModel extends BlenderbotPreTrainedModel {} /** * The Blenderbot Model with a language modeling head. Can be used for summarization. */ export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel { + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `BlenderbotForConditionalGeneration` class. - * @param {any} config The model configuration. - * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Blenderbot models -export class BlenderbotSmallPreTrainedModel extends PreTrainedModel { }; +export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} /** * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. */ -export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel { } +export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} /** * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. */ export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel { + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. 
+ * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `BlenderbotForConditionalGeneration` class. - * @param {any} config The model configuration. - * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Roberta models -export class RobertaPreTrainedModel extends PreTrainedModel { } -export class RobertaModel extends RobertaPreTrainedModel { } 
+export class RobertaPreTrainedModel extends PreTrainedModel {} +export class RobertaModel extends RobertaPreTrainedModel {} /** * RobertaForMaskedLM class for performing masked language modeling on Roberta models. */ export class RobertaForMaskedLM extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * RobertaForSequenceClassification class for performing sequence classification on Roberta models. */ export class RobertaForSequenceClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * RobertaForTokenClassification class for performing token classification on Roberta models. */ export class RobertaForTokenClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * RobertaForQuestionAnswering class for performing question answering on Roberta models. */ export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // XLM models /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class XLMPreTrainedModel extends PreTrainedModel { } +export class XLMPreTrainedModel extends PreTrainedModel {} /** * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. */ -export class XLMModel extends XLMPreTrainedModel { } +export class XLMModel extends XLMPreTrainedModel {} /** * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ export class XLMWithLMHeadModel extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class XLMForSequenceClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) */ export class XLMForTokenClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * XLM Model with a span classification head on top for extractive question-answering tasks */ export class XLMForQuestionAnswering extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // XLMRoberta models -export class XLMRobertaPreTrainedModel extends PreTrainedModel { } -export class XLMRobertaModel extends XLMRobertaPreTrainedModel { } +export class XLMRobertaPreTrainedModel extends PreTrainedModel {} +export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} /** * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. */ export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } } /** * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. 
*/ export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } /** * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. */ export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } } /** * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. */ export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Audio Spectrogram Transformer (AST) models -export class ASTPreTrainedModel extends PreTrainedModel { }; +export class ASTPreTrainedModel extends PreTrainedModel {} /** * The bare AST Model transformer outputting raw hidden-states without any specific head on top. */ -export class ASTModel extends ASTPreTrainedModel { } +export class ASTModel extends ASTPreTrainedModel {} /** * Audio Spectrogram Transformer model with an audio classification head on top * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. */ -export class ASTForAudioClassification extends ASTPreTrainedModel { } +export class ASTForAudioClassification extends ASTPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Whisper models -export class WhisperPreTrainedModel extends PreTrainedModel { }; +export class WhisperPreTrainedModel extends PreTrainedModel {} /** * WhisperModel class for training Whisper models without a language model head. */ -export class WhisperModel extends WhisperPreTrainedModel { } +export class WhisperModel extends WhisperPreTrainedModel {} /** * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. */ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { + requires_attention_mask = false; + main_input_name = "input_features"; - requires_attention_mask = false; - main_input_name = 'input_features'; + /** + * Creates a new instance of the `WhisperForConditionalGeneration` class. + * @param {Object} config Configuration object for the model. + * @param {Object} session ONNX Session object for the model. 
+ * @param {Object} decoder_merged_session ONNX Session object for the decoder. + * @param {Object} generation_config Configuration object for the generation process. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `WhisperForConditionalGeneration` class. - * @param {Object} config Configuration object for the model. - * @param {Object} session ONNX Session object for the model. - * @param {Object} decoder_merged_session ONNX Session object for the decoder. - * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + /** + * @typedef {Object} WhisperGenerationConfig + * @extends GenerationConfig + * @property {boolean} [return_timestamps=null] Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. 
+ * @property {boolean} [return_token_timestamps=null] Whether to return token-level timestamps + * with the text. This can be used with or without the `return_timestamps` option. To get word-level + * timestamps, use the tokenizer to group the tokens into words. + * @property {number} [num_frames=null] The number of audio frames available in this chunk. This is only used generating word-level timestamps. + */ + + /** + * Generates outputs based on input and generation configuration. + * @param {Object} inputs Input data for the model. + * @param {WhisperGenerationConfig} generation_config Configuration object for the generation process. + * @param {Object} logits_processor Optional logits processor object. + * @returns {Promise} Promise object represents the generated outputs. + */ + async generate( + inputs, + generation_config = null, + logits_processor = null, + // { + // return_timestamps = null, + // return_token_timestamps = null, + // language = null, + // task = null, + // } = {}, + ) { + // Create generation config object + generation_config = this._get_generation_config(generation_config); + + // Whisper has additional options for returning timestamps + generation_config.return_timestamps ??= false; + + // TODO add language and task + + if (generation_config.return_timestamps) { + logits_processor = [ + new WhisperTimeStampLogitsProcessor(generation_config), + ]; } - /** - * @typedef {Object} WhisperGenerationConfig - * @extends GenerationConfig - * @property {boolean} [return_timestamps=null] Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. - * @property {boolean} [return_token_timestamps=null] Whether to return token-level timestamps - * with the text. This can be used with or without the `return_timestamps` option. To get word-level - * timestamps, use the tokenizer to group the tokens into words. - * @property {number} [num_frames=null] The number of audio frames available in this chunk. 
This is only used generating word-level timestamps. - */ + if (generation_config.return_token_timestamps) { + generation_config.output_attentions = true; + generation_config.return_dict_in_generate = true; - /** - * Generates outputs based on input and generation configuration. - * @param {Object} inputs Input data for the model. - * @param {WhisperGenerationConfig} generation_config Configuration object for the generation process. - * @param {Object} logits_processor Optional logits processor object. - * @returns {Promise} Promise object represents the generated outputs. - */ - async generate( - inputs, - generation_config = null, - logits_processor = null, - // { - // return_timestamps = null, - // return_token_timestamps = null, - // language = null, - // task = null, - // } = {}, - ) { - // Create generation config object - generation_config = this._get_generation_config(generation_config); - - - // Whisper has additional options for returning timestamps - generation_config.return_timestamps ??= false; - - // TODO add language and task - - if (generation_config.return_timestamps) { - logits_processor = [new WhisperTimeStampLogitsProcessor(generation_config)] - } - - if (generation_config.return_token_timestamps) { - generation_config.output_attentions = true; - generation_config.return_dict_in_generate = true; - - if (generation_config.task === 'translate') { - console.warn("Token-level timestamps may not be reliable for task 'translate'.") - } - - if (!generation_config.alignment_heads) { - throw new Error( - "Model generation config has no `alignment_heads`, token-level timestamps not available. " + - "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." 
- ) - } - } - - const outputs = await super.generate(inputs, generation_config, logits_processor); - - if (generation_config.return_token_timestamps && generation_config.alignment_heads) { - outputs["token_timestamps"] = this._extract_token_timestamps( - outputs, - generation_config.alignment_heads, - generation_config.num_frames, - ) - } - - return outputs - } - - /** - * Calculates token-level timestamps using the encoder-decoder cross-attentions and - * dynamic time-warping (DTW) to map each output token to a position in the input audio. - * @param {Object} generate_outputs Outputs generated by the model - * @param {Tensor[][][]} generate_outputs.cross_attentions The cross attentions output by the model - * @param {Tensor[][][]} generate_outputs.decoder_attentions The decoder attentions output by the model - * @param {number[][]} generate_outputs.sequences The sequences output by the model - * @param {number[][]} alignment_heads Alignment heads of the model - * @param {number} [num_frames=null] Number of frames in the input audio. - * @param {number} [time_precision=0.02] Precision of the timestamps in seconds - * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token - */ - _extract_token_timestamps(generate_outputs, alignment_heads, num_frames = null, time_precision = 0.02) { - if (!generate_outputs.cross_attentions) { - throw new Error( - "Model outputs must contain cross attentions to extract timestamps. " + - "This is most likely because the model was not exported with `output_attentions=True`." 
- ) - } - - let median_filter_width = this.config.median_filter_width; - if (median_filter_width === undefined) { - console.warn("Model config has no `median_filter_width`, using default value of 7.") - median_filter_width = 7; - } - - const batchedMatrices = generate_outputs.cross_attentions.map(batch => { - // Create a list with `decoder_layers` elements, each a tensor of shape - // (batch size, attention_heads, output length, input length). - let cross_attentions = Array.from({ length: this.config.decoder_layers }, - (_, i) => cat(batch.map(x => x[i]), 2) - ); - - let weights = stack(alignment_heads.map(([l, h]) => { - return num_frames - ? cross_attentions[l].slice(null, h, null, [0, num_frames]) - : cross_attentions[l].slice(null, h); - })); - weights = weights.transpose(1, 0, 2, 3) - - let [std, calculatedMean] = std_mean(weights, -2, 0, true); - - // Normalize and smoothen the weights. - let smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] - - for (let a = 0; a < smoothedWeights.dims[0]; ++a) { - let aTensor = smoothedWeights[a]; // [8, seqLength, 1500] - - for (let b = 0; b < aTensor.dims[0]; ++b) { - let bTensor = aTensor[b]; // [seqLength, 1500] - - const stdTensor = std[a][b][0]; // [1500] - const meanTensor = calculatedMean[a][b][0]; // [1500] - - for (let c = 0; c < bTensor.dims[0]; ++c) { - - let cTensor = bTensor[c]; // [1500] - for (let d = 0; d < cTensor.data.length; ++d) { - cTensor.data[d] = (cTensor.data[d] - meanTensor.data[d]) / stdTensor.data[d] - } - - // Apply median filter. - // cTensor.data.set(medianFilter(cTensor.data, median_filter_width)) - } - } - } - - // Average the different cross-attention heads. 
- const matrix = mean(smoothedWeights, 1); - return matrix; - }); - - const timestampsShape = [generate_outputs.sequences.length, generate_outputs.sequences[0].length]; - - const timestamps = new Tensor( - 'float32', - new Float32Array(timestampsShape[0] * timestampsShape[1]), - timestampsShape + if (generation_config.task === "translate") { + console.warn( + "Token-level timestamps may not be reliable for task 'translate'.", ); + } - // Perform dynamic time warping on each element of the batch. - for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { - // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions - // as the python implementation - const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); - let [text_indices, time_indices] = dynamicTimeWarping(matrix); - - let diffs = Array.from({ length: text_indices.length - 1 }, (v, i) => text_indices[i + 1] - text_indices[i]); - let jumps = mergeArrays([1], diffs).map(x => !!x); // convert to boolean - - let jump_times = []; - for (let i = 0; i < jumps.length; ++i) { - if (jumps[i]) { - jump_times.push(time_indices[i] * time_precision); - // NOTE: No point in rounding here, since we set to Float32Array later - } - } - timestamps[batch_idx].data.set(jump_times, 1) - } - - return timestamps; + if (!generation_config.alignment_heads) { + throw new Error( + "Model generation config has no `alignment_heads`, token-level timestamps not available. 
" + + "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.", + ); + } } + + const outputs = await super.generate( + inputs, + generation_config, + logits_processor, + ); + + if ( + generation_config.return_token_timestamps && + generation_config.alignment_heads + ) { + outputs["token_timestamps"] = this._extract_token_timestamps( + outputs, + generation_config.alignment_heads, + generation_config.num_frames, + ); + } + + return outputs; + } + + /** + * Calculates token-level timestamps using the encoder-decoder cross-attentions and + * dynamic time-warping (DTW) to map each output token to a position in the input audio. + * @param {Object} generate_outputs Outputs generated by the model + * @param {Tensor[][][]} generate_outputs.cross_attentions The cross attentions output by the model + * @param {Tensor[][][]} generate_outputs.decoder_attentions The decoder attentions output by the model + * @param {number[][]} generate_outputs.sequences The sequences output by the model + * @param {number[][]} alignment_heads Alignment heads of the model + * @param {number} [num_frames=null] Number of frames in the input audio. + * @param {number} [time_precision=0.02] Precision of the timestamps in seconds + * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token + */ + _extract_token_timestamps( + generate_outputs, + alignment_heads, + num_frames = null, + time_precision = 0.02, + ) { + if (!generate_outputs.cross_attentions) { + throw new Error( + "Model outputs must contain cross attentions to extract timestamps. 
" + + "This is most likely because the model was not exported with `output_attentions=True`.", + ); + } + + let median_filter_width = this.config.median_filter_width; + if (median_filter_width === undefined) { + console.warn( + "Model config has no `median_filter_width`, using default value of 7.", + ); + median_filter_width = 7; + } + + const batchedMatrices = generate_outputs.cross_attentions.map((batch) => { + // Create a list with `decoder_layers` elements, each a tensor of shape + // (batch size, attention_heads, output length, input length). + let cross_attentions = Array.from( + { length: this.config.decoder_layers }, + (_, i) => + cat( + batch.map((x) => x[i]), + 2, + ), + ); + + let weights = stack( + alignment_heads.map(([l, h]) => { + return num_frames + ? cross_attentions[l].slice(null, h, null, [0, num_frames]) + : cross_attentions[l].slice(null, h); + }), + ); + weights = weights.transpose(1, 0, 2, 3); + + let [std, calculatedMean] = std_mean(weights, -2, 0, true); + + // Normalize and smoothen the weights. + let smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] + + for (let a = 0; a < smoothedWeights.dims[0]; ++a) { + let aTensor = smoothedWeights[a]; // [8, seqLength, 1500] + + for (let b = 0; b < aTensor.dims[0]; ++b) { + let bTensor = aTensor[b]; // [seqLength, 1500] + + const stdTensor = std[a][b][0]; // [1500] + const meanTensor = calculatedMean[a][b][0]; // [1500] + + for (let c = 0; c < bTensor.dims[0]; ++c) { + let cTensor = bTensor[c]; // [1500] + for (let d = 0; d < cTensor.data.length; ++d) { + cTensor.data[d] = + (cTensor.data[d] - meanTensor.data[d]) / stdTensor.data[d]; + } + + // Apply median filter. + // cTensor.data.set(medianFilter(cTensor.data, median_filter_width)) + } + } + } + + // Average the different cross-attention heads. 
+ const matrix = mean(smoothedWeights, 1); + return matrix; + }); + + const timestampsShape = [ + generate_outputs.sequences.length, + generate_outputs.sequences[0].length, + ]; + + const timestamps = new Tensor( + "float32", + new Float32Array(timestampsShape[0] * timestampsShape[1]), + timestampsShape, + ); + + // Perform dynamic time warping on each element of the batch. + for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { + // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions + // as the python implementation + const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); + let [text_indices, time_indices] = dynamicTimeWarping(matrix); + + let diffs = Array.from( + { length: text_indices.length - 1 }, + (v, i) => text_indices[i + 1] - text_indices[i], + ); + let jumps = mergeArrays([1], diffs).map((x) => !!x); // convert to boolean + + let jump_times = []; + for (let i = 0; i < jumps.length; ++i) { + if (jumps[i]) { + jump_times.push(time_indices[i] * time_precision); + // NOTE: No point in rounding here, since we set to Float32Array later + } + } + timestamps[batch_idx].data.set(jump_times, 1); + } + + return timestamps; + } } ////////////////////////////////////////////////// @@ -2984,90 +3205,99 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks */ export class VisionEncoderDecoderModel extends PreTrainedModel { - main_input_name = 'pixel_values'; + main_input_name = "pixel_values"; - /** - * Creates a new instance of the `VisionEncoderDecoderModel` class. - * @param {Object} config The configuration object specifying the hyperparameters and other model settings. - * @param {Object} session The ONNX session containing the encoder model. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder model. 
- * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + /** + * Creates a new instance of the `VisionEncoderDecoderModel` class. + * @param {Object} config The configuration object specifying the hyperparameters and other model settings. + * @param {Object} session The ONNX session containing the encoder model. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder model. + * @param {Object} generation_config Configuration object for the generation process. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - // Extract configs - const encoderConfig = this.config.encoder; - const decoderConfig = this.config.decoder; + // Extract configs + const encoderConfig = this.config.encoder; + const decoderConfig = this.config.decoder; - // Validate encoder - const encoderModelType = encoderConfig.model_type; - const encoderModel = - MODEL_MAPPING_NAMES_ENCODER_ONLY.get(encoderModelType) - ?? MODEL_MAPPING_NAMES_ENCODER_DECODER.get(encoderModelType); - if (!encoderModel) { - console.warn(`Model type for encoder '${encoderModelType}' not found, assuming encoder-only architecture. 
Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`); - } - - // Validate decoder - const decoderModel = MODEL_WITH_LM_HEAD_MAPPING_NAMES.get(decoderConfig.model_type); - if (!decoderModel) { - throw new Error(`Unable to construct \`VisionEncoderDecoder\` due to unsupported decoder: "${this.config.decoder.model_type}"`); - } - - // @ts-ignore - const decoderModelClass = decoderModel[1]; - // @ts-ignore - const decoder = new decoderModelClass(decoderConfig, decoder_merged_session, generation_config); - - this.add_encoder_pkv = 'num_decoder_layers' in decoder; - if (this.add_encoder_pkv) { - // Decoder is part of an encoder-decoder model - this.num_decoder_layers = decoder.num_decoder_layers; - this.num_decoder_heads = decoder.num_decoder_heads; - this.decoder_dim_kv = decoder.decoder_dim_kv; - - this.num_encoder_layers = decoder.num_encoder_layers; - this.num_encoder_heads = decoder.num_encoder_heads; - this.encoder_dim_kv = decoder.encoder_dim_kv; - - } else { - // Decoder is a decoder-only model - this.num_layers = decoder.num_layers; - this.num_heads = decoder.num_heads; - this.dim_kv = decoder.dim_kv; - } + // Validate encoder + const encoderModelType = encoderConfig.model_type; + const encoderModel = + MODEL_MAPPING_NAMES_ENCODER_ONLY.get(encoderModelType) ?? + MODEL_MAPPING_NAMES_ENCODER_DECODER.get(encoderModelType); + if (!encoderModel) { + console.warn( + `Model type for encoder '${encoderModelType}' not found, assuming encoder-only architecture. 
Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`, + ); } + + // Validate decoder + const decoderModel = MODEL_WITH_LM_HEAD_MAPPING_NAMES.get( + decoderConfig.model_type, + ); + if (!decoderModel) { + throw new Error( + `Unable to construct \`VisionEncoderDecoder\` due to unsupported decoder: "${this.config.decoder.model_type}"`, + ); + } + + // @ts-ignore + const decoderModelClass = decoderModel[1]; + // @ts-ignore + const decoder = new decoderModelClass( + decoderConfig, + decoder_merged_session, + generation_config, + ); + + this.add_encoder_pkv = "num_decoder_layers" in decoder; + if (this.add_encoder_pkv) { + // Decoder is part of an encoder-decoder model + this.num_decoder_layers = decoder.num_decoder_layers; + this.num_decoder_heads = decoder.num_decoder_heads; + this.decoder_dim_kv = decoder.decoder_dim_kv; + + this.num_encoder_layers = decoder.num_encoder_layers; + this.num_encoder_heads = decoder.num_encoder_heads; + this.encoder_dim_kv = decoder.encoder_dim_kv; + } else { + // Decoder is a decoder-only model + this.num_layers = decoder.num_layers; + this.num_heads = decoder.num_heads; + this.dim_kv = decoder.dim_kv; + } + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // CLIP models -export class CLIPPreTrainedModel extends PreTrainedModel { } +export class CLIPPreTrainedModel extends PreTrainedModel {} /** * CLIP Text and Vision Model with a projection layers on top - * + * * **Example:** Perform zero-shot image classification with a `CLIPModel`. 
- * + * * ```javascript * import { AutoTokenizer, AutoProcessor, CLIPModel, RawImage } from '@xenova/transformers'; - * + * * // Load tokenizer, processor, and model * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); * let model = await CLIPModel.from_pretrained('Xenova/clip-vit-base-patch16'); - * + * * // Run tokenization * let texts = ['a photo of a car', 'a photo of a football match'] * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * + * * // Read image and run processor * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); * let image_inputs = await processor(image); - * + * * // Run model with both text and pixel inputs * let output = await model({ ...text_inputs, ...image_inputs }); * // { @@ -3090,24 +3320,24 @@ export class CLIPPreTrainedModel extends PreTrainedModel { } * // } * ``` */ -export class CLIPModel extends CLIPPreTrainedModel { } +export class CLIPModel extends CLIPPreTrainedModel {} /** * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) - * + * * **Example:** Compute text embeddings with `CLIPTextModelWithProjection`. 
- * + * * ```javascript * import { AutoTokenizer, CLIPTextModelWithProjection } from '@xenova/transformers'; - * + * * // Load tokenizer and text model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); * const text_model = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * + * * // Run tokenization * let texts = ['a photo of a car', 'a photo of a football match']; * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * + * * // Compute embeddings * const { text_embeds } = await text_model(text_inputs); * // Tensor { @@ -3119,31 +3349,30 @@ export class CLIPModel extends CLIPPreTrainedModel { } * ``` */ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { - - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "text_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } /** * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) - * + * * **Example:** Compute vision embeddings with `CLIPVisionModelWithProjection`. 
- * + * * ```javascript * import { AutoProcessor, CLIPVisionModelWithProjection, RawImage} from '@xenova/transformers'; - * + * * // Load processor and vision model * const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); * const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * + * * // Read image and run processor * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); * let image_inputs = await processor(image); - * + * * // Compute embeddings * const { image_embeds } = await vision_model(image_inputs); * // Tensor { @@ -3155,41 +3384,40 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { * ``` */ export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "vision_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // SigLIP models -export class SiglipPreTrainedModel extends PreTrainedModel { } +export class SiglipPreTrainedModel extends PreTrainedModel {} /** * SigLIP Text and Vision Model with a projection layers on top - * + * * **Example:** Perform zero-shot image classification with a `SiglipModel`. 
- * + * * ```javascript * import { AutoTokenizer, AutoProcessor, SiglipModel, RawImage } from '@xenova/transformers'; - * + * * // Load tokenizer, processor, and model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); * const model = await SiglipModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * + * * // Run tokenization * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * + * * // Read image and run processor * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg'); * const image_inputs = await processor(image); - * + * * // Run model with both text and pixel inputs * const output = await model({ ...text_inputs, ...image_inputs }); * // { @@ -3212,24 +3440,24 @@ export class SiglipPreTrainedModel extends PreTrainedModel { } * // } * ``` */ -export class SiglipModel extends SiglipPreTrainedModel { } +export class SiglipModel extends SiglipPreTrainedModel {} /** * The text model from SigLIP without any head or projection on top. - * + * * **Example:** Compute text embeddings with `SiglipTextModel`. 
- * + * * ```javascript * import { AutoTokenizer, SiglipTextModel } from '@xenova/transformers'; - * + * * // Load tokenizer and text model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); * const text_model = await SiglipTextModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * + * * // Run tokenization * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * + * * // Compute embeddings * const { pooler_output } = await text_model(text_inputs); * // Tensor { @@ -3241,31 +3469,30 @@ export class SiglipModel extends SiglipPreTrainedModel { } * ``` */ export class SiglipTextModel extends SiglipPreTrainedModel { - - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "text_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } /** * The vision model from SigLIP without any head or projection on top. - * + * * **Example:** Compute vision embeddings with `SiglipVisionModel`. 
- * + * * ```javascript * import { AutoProcessor, SiglipVisionModel, RawImage} from '@xenova/transformers'; - * + * * // Load processor and vision model * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); * const vision_model = await SiglipVisionModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * + * * // Read image and run processor * const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); * const image_inputs = await processor(image); - * + * * // Compute embeddings * const { pooler_output } = await vision_model(image_inputs); * // Tensor { @@ -3277,48 +3504,47 @@ export class SiglipTextModel extends SiglipPreTrainedModel { * ``` */ export class SiglipVisionModel extends CLIPPreTrainedModel { - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "vision_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } ////////////////////////////////////////////////// // ChineseCLIP models -export class ChineseCLIPPreTrainedModel extends PreTrainedModel { } +export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} -export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { } +export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // CLIPSeg models -export class CLIPSegPreTrainedModel extends PreTrainedModel { } +export class 
CLIPSegPreTrainedModel extends PreTrainedModel {} -export class CLIPSegModel extends CLIPSegPreTrainedModel { } +export class CLIPSegModel extends CLIPSegPreTrainedModel {} /** * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. - * + * * **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model. - * + * * ```javascript * import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@xenova/transformers'; - * + * * // Load tokenizer, processor, and model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined'); * const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined'); * const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined'); - * + * * // Run tokenization * const texts = ['a glass', 'something to fill', 'wood', 'a jar']; * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * + * * // Read image and run processor * const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true'); * const image_inputs = await processor(image); - * + * * // Run model with both text and pixel inputs * const { logits } = await model({ ...text_inputs, ...image_inputs }); * // logits: Tensor { @@ -3328,7 +3554,7 @@ export class CLIPSegModel extends CLIPSegPreTrainedModel { } * // size: 495616 * // } * ``` - * + * * You can visualize the predictions as follows: * ```javascript * const preds = logits @@ -3337,45 +3563,44 @@ export class CLIPSegModel extends CLIPSegPreTrainedModel { } * .mul_(255) * .round_() * .to('uint8'); - * + * * for (let i = 0; i < preds.dims[0]; ++i) { * const img = RawImage.fromTensor(preds[i]); * img.save(`prediction_${i}.png`); * } * ``` */ -export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel { } +export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} 
////////////////////////////////////////////////// - ////////////////////////////////////////////////// // GPT2 models export class GPT2PreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPT2PreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `GPT2PreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_head - this.num_layers = this.config.n_layer - this.dim_kv = this.config.n_embd / this.num_heads; - } + this.num_heads = this.config.n_head; + this.num_layers = this.config.n_layer; + this.dim_kv = this.config.n_embd / this.num_heads; + } } -export class GPT2Model extends GPT2PreTrainedModel { } +export class GPT2Model extends GPT2PreTrainedModel {} /** * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. 
*/ -export class GPT2LMHeadModel extends GPT2PreTrainedModel { } +export class GPT2LMHeadModel extends GPT2PreTrainedModel {} // export class GPT2ForSequenceClassification extends GPT2PreTrainedModel { // TODO // } @@ -3384,144 +3609,141 @@ export class GPT2LMHeadModel extends GPT2PreTrainedModel { } ////////////////////////////////////////////////// // GPTNeo models export class GPTNeoPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTNeoPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `GPTNeoPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_heads; - this.num_layers = this.config.num_layers; - this.dim_kv = this.config.hidden_size / this.num_heads; - } + this.num_heads = this.config.num_heads; + this.num_layers = this.config.num_layers; + this.dim_kv = this.config.hidden_size / this.num_heads; + } } -export class GPTNeoModel extends GPTNeoPreTrainedModel { } +export class GPTNeoModel extends GPTNeoPreTrainedModel {} -export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel { } +export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // GPTNeoX models export class GPTNeoXPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTNeoXPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `GPTNeoXPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_attention_heads; - this.num_layers = this.config.num_hidden_layers; - this.dim_kv = this.config.hidden_size / this.num_heads; - } + this.num_heads = this.config.num_attention_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.num_heads; + } } -export class GPTNeoXModel extends GPTNeoXPreTrainedModel { } +export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} -export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel { } +export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // GPT-J models export class GPTJPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTJPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `GPTJPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_head - this.num_layers = this.config.n_layer - this.dim_kv = this.config.n_embd / this.num_heads; - } + this.num_heads = this.config.n_head; + this.num_layers = this.config.n_layer; + this.dim_kv = this.config.n_embd / this.num_heads; + } } -export class GPTJModel extends GPTJPreTrainedModel { } +export class GPTJModel extends GPTJPreTrainedModel {} -export class GPTJForCausalLM extends GPTJPreTrainedModel { } +export class GPTJForCausalLM extends GPTJPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // GPTBigCode models export class GPTBigCodePreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTBigCodePreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `GPTBigCodePreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_head - this.num_layers = this.config.n_layer - this.dim_kv = this.config.n_embd / this.num_heads; - } + this.num_heads = this.config.n_head; + this.num_layers = this.config.n_layer; + this.dim_kv = this.config.n_embd / this.num_heads; + } } -export class GPTBigCodeModel extends GPTBigCodePreTrainedModel { } +export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} -export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel { } +export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // CodeGen models export class CodeGenPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `CodeGenPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `CodeGenPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_head - this.num_layers = this.config.n_layer - this.dim_kv = this.config.n_embd / this.num_heads; - } + this.num_heads = this.config.n_head; + this.num_layers = this.config.n_layer; + this.dim_kv = this.config.n_embd / this.num_heads; + } } /** * CodeGenModel is a class representing a code generation model without a language model head. */ -export class CodeGenModel extends CodeGenPreTrainedModel { } +export class CodeGenModel extends CodeGenPreTrainedModel {} /** * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class. */ -export class CodeGenForCausalLM extends CodeGenPreTrainedModel { } +export class CodeGenForCausalLM extends CodeGenPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // LLama models @@ -3529,200 +3751,199 @@ export class CodeGenForCausalLM extends CodeGenPreTrainedModel { } * The bare LLama Model outputting raw hidden-states without any specific head on top. */ export class LlamaPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `LlamaPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `LlamaPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_key_value_heads ?? this.config.num_attention_heads - this.num_layers = this.config.num_hidden_layers - this.dim_kv = this.config.hidden_size / this.config.num_attention_heads - } + this.num_heads = + this.config.num_key_value_heads ?? this.config.num_attention_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.config.num_attention_heads; + } } /** * The bare LLaMA Model outputting raw hidden-states without any specific head on top. */ -export class LlamaModel extends LlamaPreTrainedModel { } +export class LlamaModel extends LlamaPreTrainedModel {} -export class LlamaForCausalLM extends LlamaPreTrainedModel { } +export class LlamaForCausalLM extends LlamaPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Phi models export class PhiPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `PhiPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `PhiPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id; + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_attention_heads; - this.num_layers = this.config.num_hidden_layers; - this.dim_kv = this.config.hidden_size / this.num_heads; - } + this.num_heads = this.config.num_attention_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.num_heads; + } } /** * The bare Phi Model outputting raw hidden-states without any specific head on top. */ -export class PhiModel extends PhiPreTrainedModel { } +export class PhiModel extends PhiPreTrainedModel {} -export class PhiForCausalLM extends PhiPreTrainedModel { } +export class PhiForCausalLM extends PhiPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Bloom models /** * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ export class BloomPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `BloomPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. 
- * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `BloomPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_head - this.num_layers = this.config.n_layer - this.dim_kv = this.config.hidden_size / this.num_heads; - } + this.num_heads = this.config.n_head; + this.num_layers = this.config.n_layer; + this.dim_kv = this.config.hidden_size / this.num_heads; + } } /** * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. */ -export class BloomModel extends BloomPreTrainedModel { } +export class BloomModel extends BloomPreTrainedModel {} /** * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ -export class BloomForCausalLM extends BloomPreTrainedModel { } +export class BloomForCausalLM extends BloomPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // MPT models export class MptPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `MptPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. 
- * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `MptPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.n_heads - this.num_layers = this.config.n_layers - this.dim_kv = this.config.d_model / this.num_heads; - } + this.num_heads = this.config.n_heads; + this.num_layers = this.config.n_layers; + this.dim_kv = this.config.d_model / this.num_heads; + } } /** * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. */ -export class MptModel extends MptPreTrainedModel { } +export class MptModel extends MptPreTrainedModel {} /** * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ -export class MptForCausalLM extends MptPreTrainedModel { } +export class MptForCausalLM extends MptPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // OPT models export class OPTPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `OPTPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. 
- * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `OPTPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_attention_heads; - this.num_layers = this.config.num_hidden_layers; - this.dim_kv = this.config.hidden_size / this.num_heads; - } + this.num_heads = this.config.num_attention_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.num_heads; + } } /** * The bare OPT Model outputting raw hidden-states without any specific head on top. */ -export class OPTModel extends OPTPreTrainedModel { } +export class OPTModel extends OPTPreTrainedModel {} /** * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
*/ -export class OPTForCausalLM extends OPTPreTrainedModel { } +export class OPTForCausalLM extends OPTPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class ViTPreTrainedModel extends PreTrainedModel { } -export class ViTModel extends ViTPreTrainedModel { } +export class ViTPreTrainedModel extends PreTrainedModel {} +export class ViTModel extends ViTPreTrainedModel {} export class ViTForImageClassification extends ViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class VitMattePreTrainedModel extends PreTrainedModel { } +export class VitMattePreTrainedModel extends PreTrainedModel {} /** * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. - * + * * **Example:** Perform image matting with a `VitMatteForImageMatting` model. 
* ```javascript * import { AutoProcessor, VitMatteForImageMatting, RawImage } from '@xenova/transformers'; - * + * * // Load processor and model * const processor = await AutoProcessor.from_pretrained('Xenova/vitmatte-small-distinctions-646'); * const model = await VitMatteForImageMatting.from_pretrained('Xenova/vitmatte-small-distinctions-646'); - * + * * // Load image and trimap * const image = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_image.png'); * const trimap = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_trimap.png'); - * + * * // Prepare image + trimap for the model * const inputs = await processor(image, trimap); - * + * * // Predict alpha matte * const { alphas } = await model(inputs); * // Tensor { @@ -3732,18 +3953,18 @@ export class VitMattePreTrainedModel extends PreTrainedModel { } * // data: Float32Array(614400) [ 0.9894027709960938, 0.9970508813858032, ... 
] * // } * ``` - * + * * You can visualize the alpha matte as follows: * ```javascript * import { Tensor, cat } from '@xenova/transformers'; - * + * * // Visualize predicted alpha matte * const imageTensor = new Tensor( * 'uint8', * new Uint8Array(image.data), * [image.height, image.width, image.channels] * ).transpose(2, 0, 1); - * + * * // Convert float (0-1) alpha matte to uint8 (0-255) * const alphaChannel = alphas * .squeeze(0) @@ -3751,220 +3972,218 @@ export class VitMattePreTrainedModel extends PreTrainedModel { } * .clamp_(0, 255) * .round_() * .to('uint8'); - * + * * // Concatenate original image with predicted alpha * const imageData = cat([imageTensor, alphaChannel], 0); - * + * * // Save output image * const outputImage = RawImage.fromTensor(imageData); * outputImage.save('output.png'); * ``` */ export class VitMatteForImageMatting extends VitMattePreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new ImageMattingOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new ImageMattingOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class MobileViTPreTrainedModel extends PreTrainedModel { } -export class MobileViTModel extends MobileViTPreTrainedModel { } +export class MobileViTPreTrainedModel extends PreTrainedModel {} +export class MobileViTModel extends MobileViTPreTrainedModel {} export class MobileViTForImageClassification extends MobileViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } // TODO: MobileViTForSemanticSegmentation 
////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class OwlViTPreTrainedModel extends PreTrainedModel { } -export class OwlViTModel extends OwlViTPreTrainedModel { } -export class OwlViTForObjectDetection extends OwlViTPreTrainedModel { } +export class OwlViTPreTrainedModel extends PreTrainedModel {} +export class OwlViTModel extends OwlViTPreTrainedModel {} +export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Beit Models -export class BeitPreTrainedModel extends PreTrainedModel { } -export class BeitModel extends BeitPreTrainedModel { } +export class BeitPreTrainedModel extends PreTrainedModel {} +export class BeitModel extends BeitPreTrainedModel {} export class BeitForImageClassification extends BeitPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class DetrPreTrainedModel extends PreTrainedModel { } -export class DetrModel extends DetrPreTrainedModel { } +export class DetrPreTrainedModel extends PreTrainedModel {} +export class DetrModel extends DetrPreTrainedModel {} export class DetrForObjectDetection extends DetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new DetrObjectDetectionOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new DetrObjectDetectionOutput(await super._call(model_inputs)); + } } export class DetrForSegmentation extends DetrPreTrainedModel { - /** - * Runs the model 
with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new DetrSegmentationOutput(await super._call(model_inputs)); - } + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new DetrSegmentationOutput(await super._call(model_inputs)); + } } export class DetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } } export class DetrSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.pred_boxes Predicted boxes. - * @param {Tensor} output.pred_masks Predicted masks. 
- */ - constructor({ logits, pred_boxes, pred_masks }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - this.pred_masks = pred_masks; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.pred_boxes Predicted boxes. + * @param {Tensor} output.pred_masks Predicted masks. + */ + constructor({ logits, pred_boxes, pred_masks }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + this.pred_masks = pred_masks; + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class TableTransformerPreTrainedModel extends PreTrainedModel { } +export class TableTransformerPreTrainedModel extends PreTrainedModel {} /** * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) * outputting raw hidden-states without any specific head on top. */ -export class TableTransformerModel extends TableTransformerPreTrainedModel { } +export class TableTransformerModel extends TableTransformerPreTrainedModel {} /** * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) * with object detection heads on top, for tasks such as COCO detection. 
*/ export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new TableTransformerObjectDetectionOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new TableTransformerObjectDetectionOutput( + await super._call(model_inputs), + ); + } } -export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput { } +export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class DeiTPreTrainedModel extends PreTrainedModel { } -export class DeiTModel extends DeiTPreTrainedModel { } +export class DeiTPreTrainedModel extends PreTrainedModel {} +export class DeiTModel extends DeiTPreTrainedModel {} export class DeiTForImageClassification extends DeiTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class ResNetPreTrainedModel extends PreTrainedModel { } +export class ResNetPreTrainedModel extends PreTrainedModel {} /** * The bare ResNet model outputting raw features without any specific head on top. */ -export class ResNetModel extends ResNetPreTrainedModel { } +export class ResNetModel extends ResNetPreTrainedModel {} /** * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. 
for ImageNet. */ export class ResNetForImageClassification extends ResNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class SwinPreTrainedModel extends PreTrainedModel { } -export class SwinModel extends SwinPreTrainedModel { } +export class SwinPreTrainedModel extends PreTrainedModel {} +export class SwinModel extends SwinPreTrainedModel {} export class SwinForImageClassification extends SwinPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class Swin2SRPreTrainedModel extends PreTrainedModel { } +export class Swin2SRPreTrainedModel extends PreTrainedModel {} /** * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. */ -export class Swin2SRModel extends Swin2SRPreTrainedModel { } +export class Swin2SRModel extends Swin2SRPreTrainedModel {} /** * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. - * + * * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`. 
- * + * * ```javascript * import { AutoProcessor, Swin2SRForImageSuperResolution, RawImage } from '@xenova/transformers'; - * + * * // Load processor and model * const model_id = 'Xenova/swin2SR-classical-sr-x2-64'; * const processor = await AutoProcessor.from_pretrained(model_id); * const model = await Swin2SRForImageSuperResolution.from_pretrained(model_id); - * + * * // Prepare model inputs * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg'; * const image = await RawImage.fromURL(url); * const inputs = await processor(image); - * + * * // Run model * const outputs = await model(inputs); - * + * * // Convert Tensor to RawImage * const output = outputs.reconstruction.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); * const outputImage = RawImage.fromTensor(output); @@ -3976,42 +4195,42 @@ export class Swin2SRModel extends Swin2SRPreTrainedModel { } * // } * ``` */ -export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel { } +export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class DPTPreTrainedModel extends PreTrainedModel { } +export class DPTPreTrainedModel extends PreTrainedModel {} /** * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. */ -export class DPTModel extends DPTPreTrainedModel { } +export class DPTModel extends DPTPreTrainedModel {} /** * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. - * + * * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`. 
* ```javascript * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@xenova/transformers'; - * + * * // Load model and processor * const model_id = 'Xenova/dpt-hybrid-midas'; * const model = await DPTForDepthEstimation.from_pretrained(model_id); * const processor = await AutoProcessor.from_pretrained(model_id); - * + * * // Load image from URL * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; * const image = await RawImage.fromURL(url); - * + * * // Prepare image for the model * const inputs = await processor(image); - * + * * // Run model * const { predicted_depth } = await model(inputs); - * + * * // Interpolate to original size * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false); - * + * * // Visualize the prediction * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8'); * const depth = RawImage.fromTensor(formatted); @@ -4023,42 +4242,42 @@ export class DPTModel extends DPTPreTrainedModel { } * // } * ``` */ -export class DPTForDepthEstimation extends DPTPreTrainedModel { } +export class DPTForDepthEstimation extends DPTPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class GLPNPreTrainedModel extends PreTrainedModel { } +export class GLPNPreTrainedModel extends PreTrainedModel {} /** * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. */ -export class GLPNModel extends GLPNPreTrainedModel { } +export class GLPNModel extends GLPNPreTrainedModel {} /** * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. - * + * * **Example:** Depth estimation w/ `Xenova/glpn-kitti`. 
* ```javascript * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@xenova/transformers'; - * + * * // Load model and processor * const model_id = 'Xenova/glpn-kitti'; * const model = await GLPNForDepthEstimation.from_pretrained(model_id); * const processor = await AutoProcessor.from_pretrained(model_id); - * + * * // Load image from URL * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; * const image = await RawImage.fromURL(url); - * + * * // Prepare image for the model * const inputs = await processor(image); - * + * * // Run model * const { predicted_depth } = await model(inputs); - * + * * // Interpolate to original size * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false); - * + * * // Visualize the prediction * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8'); * const depth = RawImage.fromTensor(formatted); @@ -4070,64 +4289,64 @@ export class GLPNModel extends GLPNPreTrainedModel { } * // } * ``` */ -export class GLPNForDepthEstimation extends GLPNPreTrainedModel { } +export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class DonutSwinPreTrainedModel extends PreTrainedModel { } +export class DonutSwinPreTrainedModel extends PreTrainedModel {} /** * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. - * + * * **Example:** Step-by-step Document Parsing. 
- * + * * ```javascript * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@xenova/transformers'; - * + * * // Choose model to use * const model_id = 'Xenova/donut-base-finetuned-cord-v2'; - * + * * // Prepare image inputs * const processor = await AutoProcessor.from_pretrained(model_id); * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png'; * const image = await RawImage.read(url); * const image_inputs = await processor(image); - * + * * // Prepare decoder inputs * const tokenizer = await AutoTokenizer.from_pretrained(model_id); * const task_prompt = ''; * const decoder_input_ids = tokenizer(task_prompt, { * add_special_tokens: false, * }).input_ids; - * + * * // Create the model * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * + * * // Run inference * const output = await model.generate(image_inputs.pixel_values, { * decoder_input_ids, * max_length: model.config.decoder.max_position_embeddings, * }); - * + * * // Decode output * const decoded = tokenizer.batch_decode(output)[0]; * // CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000 * ``` - * + * * **Example:** Step-by-step Document Visual Question Answering (DocVQA) - * + * * ```javascript * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@xenova/transformers'; - * + * * // Choose model to use * const model_id = 'Xenova/donut-base-finetuned-docvqa'; - * + * * // Prepare image inputs * const processor = await AutoProcessor.from_pretrained(model_id); * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png'; * const image = await RawImage.read(url); * const image_inputs = await processor(image); - * + * * // Prepare decoder inputs * const tokenizer = await AutoTokenizer.from_pretrained(model_id); * const question = 'What is the invoice number?'; @@ -4135,139 +4354,135 @@ export class DonutSwinPreTrainedModel extends 
PreTrainedModel { } * const decoder_input_ids = tokenizer(task_prompt, { * add_special_tokens: false, * }).input_ids; - * + * * // Create the model * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * + * * // Run inference * const output = await model.generate(image_inputs.pixel_values, { * decoder_input_ids, * max_length: model.config.decoder.max_position_embeddings, * }); - * + * * // Decode output * const decoded = tokenizer.batch_decode(output)[0]; * // What is the invoice number? us-001 * ``` */ -export class DonutSwinModel extends DonutSwinPreTrainedModel { } +export class DonutSwinModel extends DonutSwinPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class ConvNextPreTrainedModel extends PreTrainedModel { } +export class ConvNextPreTrainedModel extends PreTrainedModel {} /** * The bare ConvNext model outputting raw features without any specific head on top. */ -export class ConvNextModel extends ConvNextPreTrainedModel { } +export class ConvNextModel extends ConvNextPreTrainedModel {} /** * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. */ export class ConvNextForImageClassification extends ConvNextPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class ConvNextV2PreTrainedModel extends PreTrainedModel { } +export class ConvNextV2PreTrainedModel extends PreTrainedModel {} /** * The bare ConvNextV2 model outputting raw features without any specific head on top. 
*/ -export class ConvNextV2Model extends ConvNextV2PreTrainedModel { } +export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} /** * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. */ export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// -export class Dinov2PreTrainedModel extends PreTrainedModel { } +export class Dinov2PreTrainedModel extends PreTrainedModel {} /** * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. */ -export class Dinov2Model extends Dinov2PreTrainedModel { } +export class Dinov2Model extends Dinov2PreTrainedModel {} /** * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
*/ export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class YolosPreTrainedModel extends PreTrainedModel { } -export class YolosModel extends YolosPreTrainedModel { } +export class YolosPreTrainedModel extends PreTrainedModel {} +export class YolosModel extends YolosPreTrainedModel {} export class YolosForObjectDetection extends YolosPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new YolosObjectDetectionOutput(await super._call(model_inputs)); - } + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new YolosObjectDetectionOutput(await super._call(model_inputs)); + } } export class YolosObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). 
+ * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// -export class SamPreTrainedModel extends PreTrainedModel { } +export class SamPreTrainedModel extends PreTrainedModel {} /** * Segment Anything Model (SAM) for generating segmentation masks, given an input image * and optional 2D location and bounding boxes. - * + * * **Example:** Perform mask generation w/ `Xenova/sam-vit-base`. * ```javascript * import { SamModel, AutoProcessor, RawImage } from '@xenova/transformers'; - * + * * const model = await SamModel.from_pretrained('Xenova/sam-vit-base'); * const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base'); - * + * * const img_url = 'https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png'; * const raw_image = await RawImage.read(img_url); * const input_points = [[[450, 600]]] // 2D localization of a window - * + * * const inputs = await processor(raw_image, input_points); * const outputs = await model(inputs); - * + * * const masks = await processor.post_process_masks(outputs.pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); * // [ * // Tensor { @@ -4291,193 +4506,191 @@ export class SamPreTrainedModel extends PreTrainedModel { } * ``` */ export class SamModel extends SamPreTrainedModel { - /** - * Creates a new instance of the `SamModel` class. - * @param {Object} config The configuration object specifying the hyperparameters and other model settings. - * @param {Object} vision_encoder The ONNX session containing the vision encoder model. - * @param {any} prompt_encoder_mask_decoder The ONNX session containing the prompt encoder and mask decoder model. 
- */ - constructor(config, vision_encoder, prompt_encoder_mask_decoder) { - super(config, vision_encoder); - this.prompt_encoder_mask_decoder = prompt_encoder_mask_decoder; + /** + * Creates a new instance of the `SamModel` class. + * @param {Object} config The configuration object specifying the hyperparameters and other model settings. + * @param {Object} vision_encoder The ONNX session containing the vision encoder model. + * @param {any} prompt_encoder_mask_decoder The ONNX session containing the prompt encoder and mask decoder model. + */ + constructor(config, vision_encoder, prompt_encoder_mask_decoder) { + super(config, vision_encoder); + this.prompt_encoder_mask_decoder = prompt_encoder_mask_decoder; + } + + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. + * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. + */ + async get_image_embeddings({ pixel_values }) { + // in: + // - pixel_values: tensor.float32[batch_size,3,1024,1024] + // + // out: + // - image_embeddings: tensor.float32[batch_size,256,64,64] + // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] + return await encoderForward(this, { pixel_values }); + } + + /** + * @typedef {Object} SamModelInputs Object containing the model inputs. + * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. + * These can be obtained using a `SamProcessor`. + * @property {Tensor} input_points Input 2D spatial points with shape `(batch_size, num_points, 2)`. + * This is used by the prompt encoder to encode the prompt. 
+ * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. + * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: + * - `1`: the point is a point that contains the object of interest + * - `0`: the point is a point that does not contain the object of interest + * - `-1`: the point corresponds to the background + * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. + * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. + */ + + /** + * @param {SamModelInputs} model_inputs Object containing the model inputs. + * @returns {Promise} The output of the model. + */ + async forward(model_inputs) { + if ( + !model_inputs.image_embeddings || + !model_inputs.image_positional_embeddings + ) { + // Compute the image embeddings if they are missing + model_inputs = { + ...model_inputs, + ...(await this.get_image_embeddings(model_inputs)), + }; } - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. - * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. 
- */ - async get_image_embeddings({ pixel_values }) { - // in: - // - pixel_values: tensor.float32[batch_size,3,1024,1024] - // - // out: - // - image_embeddings: tensor.float32[batch_size,256,64,64] - // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] - return await encoderForward(this, { pixel_values }) + if (!model_inputs.input_labels) { + // Set default input labels if they are missing + const shape = model_inputs.input_points.dims.slice(0, -1); + const numElements = shape.reduce((a, b) => a * b, 1); + model_inputs.input_labels = new Tensor( + "int64", + new BigInt64Array(numElements).fill(1n), + shape, + ); } - /** - * @typedef {Object} SamModelInputs Object containing the model inputs. - * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. - * These can be obtained using a `SamProcessor`. - * @property {Tensor} input_points Input 2D spatial points with shape `(batch_size, num_points, 2)`. - * This is used by the prompt encoder to encode the prompt. - * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. - * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: - * - `1`: the point is a point that contains the object of interest - * - `0`: the point is a point that does not contain the object of interest - * - `-1`: the point corresponds to the background - * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder - * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. - * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. 
- */ + // Returns: + // - iou_scores: tensor.float32[batch_size,point_batch_size,3] + // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] + return await sessionRun(this.prompt_encoder_mask_decoder, { + input_points: model_inputs.input_points, + input_labels: model_inputs.input_labels, + image_embeddings: model_inputs.image_embeddings, + image_positional_embeddings: model_inputs.image_positional_embeddings, + }); + } - /** - * @param {SamModelInputs} model_inputs Object containing the model inputs. - * @returns {Promise} The output of the model. - */ - async forward(model_inputs) { - if (!model_inputs.image_embeddings || !model_inputs.image_positional_embeddings) { - // Compute the image embeddings if they are missing - model_inputs = { - ...model_inputs, - ...(await this.get_image_embeddings(model_inputs)) - } - } - - if (!model_inputs.input_labels) { - // Set default input labels if they are missing - const shape = model_inputs.input_points.dims.slice(0, -1); - const numElements = shape.reduce((a, b) => a * b, 1); - model_inputs.input_labels = new Tensor( - 'int64', - new BigInt64Array(numElements).fill(1n), - shape - ); - } - - // Returns: - // - iou_scores: tensor.float32[batch_size,point_batch_size,3] - // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] - return await sessionRun(this.prompt_encoder_mask_decoder, { - input_points: model_inputs.input_points, - input_labels: model_inputs.input_labels, - image_embeddings: model_inputs.image_embeddings, - image_positional_embeddings: model_inputs.image_positional_embeddings, - }); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new SamImageSegmentationOutput(await super._call(model_inputs)); - } + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object 
containing segmentation outputs + */ + async _call(model_inputs) { + return new SamImageSegmentationOutput(await super._call(model_inputs)); + } } - /** * Base class for Segment-Anything model's output. */ export class SamImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. - * @param {Tensor} output.pred_masks Predicted boxes. - */ - constructor({ iou_scores, pred_masks }) { - super(); - this.iou_scores = iou_scores; - this.pred_masks = pred_masks; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model. + * @param {Tensor} output.pred_masks Predicted boxes. + */ + constructor({ iou_scores, pred_masks }) { + super(); + this.iou_scores = iou_scores; + this.pred_masks = pred_masks; + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // MarianMT models -export class MarianPreTrainedModel extends PreTrainedModel { }; +export class MarianPreTrainedModel extends PreTrainedModel {} -export class MarianModel extends MarianPreTrainedModel { } +export class MarianModel extends MarianPreTrainedModel {} export class MarianMTModel extends MarianPreTrainedModel { + /** + * Creates a new instance of the `MarianMTModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {any} decoder_merged_session + * @param {any} generation_config + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `MarianMTModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. 
- * @param {any} decoder_merged_session - * @param {any} generation_config - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // M2M100 models -export class M2M100PreTrainedModel extends PreTrainedModel { }; +export class M2M100PreTrainedModel extends PreTrainedModel {} -export class M2M100Model extends M2M100PreTrainedModel { } +export class M2M100Model extends M2M100PreTrainedModel {} export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel { + /** + * Creates a new instance of the `M2M100ForConditionalGeneration` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. 
+ * @param {any} decoder_merged_session + * @param {any} generation_config + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `M2M100ForConditionalGeneration` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {any} decoder_merged_session - * @param {any} generation_config - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; - - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; - } + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Wav2Vec2 models -export class Wav2Vec2PreTrainedModel extends PreTrainedModel { }; +export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} /** * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. - * + * * **Example:** Load and run a `Wav2Vec2Model` for feature extraction. 
- * + * * ```javascript * import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers'; - * + * * // Read and preprocess audio * const processor = await AutoProcessor.from_pretrained('Xenova/mms-300m'); * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000); * const inputs = await processor(audio); - * + * * // Run model with inputs * const model = await AutoModel.from_pretrained('Xenova/mms-300m'); * const output = await model(inputs); @@ -4491,48 +4704,48 @@ export class Wav2Vec2PreTrainedModel extends PreTrainedModel { }; * // } * ``` */ -export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel { } +export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } } export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
- */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Hubert models -export class HubertPreTrainedModel extends PreTrainedModel { } +export class HubertPreTrainedModel extends PreTrainedModel {} /** * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. - * + * * **Example:** Load and run a `HubertModel` for feature extraction. - * + * * ```javascript * import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers'; - * + * * // Read and preprocess audio * const processor = await AutoProcessor.from_pretrained('Xenova/hubert-base-ls960'); * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); * const inputs = await processor(audio); - * + * * // Load and run model with inputs * const model = await AutoModel.from_pretrained('Xenova/hubert-base-ls960'); * const output = await model(inputs); @@ -4546,34 +4759,34 @@ export class HubertPreTrainedModel extends PreTrainedModel { } * // } * ``` */ -export class HubertModel extends Wav2Vec2PreTrainedModel { } +export class HubertModel extends Wav2Vec2PreTrainedModel {} /** * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). */ export class HubertForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. 
- * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } } /** * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. */ export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// @@ -4582,21 +4795,21 @@ export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class WavLMPreTrainedModel extends PreTrainedModel { }; +export class WavLMPreTrainedModel extends PreTrainedModel {} /** * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. 
- * + * * **Example:** Load and run a `WavLMModel` for feature extraction. - * + * * ```javascript * import { AutoProcessor, AutoModel, read_audio } from '@xenova/transformers'; - * + * * // Read and preprocess audio * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base'); * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); * const inputs = await processor(audio); - * + * * // Run model with inputs * const model = await AutoModel.from_pretrained('Xenova/wavlm-base'); * const output = await model(inputs); @@ -4610,34 +4823,34 @@ export class WavLMPreTrainedModel extends PreTrainedModel { }; * // } * ``` */ -export class WavLMModel extends WavLMPreTrainedModel { } +export class WavLMModel extends WavLMPreTrainedModel {} /** * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). */ export class WavLMForCTC extends WavLMPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } } /** * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). */ export class WavLMForSequenceClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. 
- * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// @@ -4645,29 +4858,29 @@ export class WavLMForSequenceClassification extends WavLMPreTrainedModel { /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class SpeechT5PreTrainedModel extends PreTrainedModel { }; +export class SpeechT5PreTrainedModel extends PreTrainedModel {} /** * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. */ -export class SpeechT5Model extends SpeechT5PreTrainedModel { }; +export class SpeechT5Model extends SpeechT5PreTrainedModel {} /** * SpeechT5 Model with a speech encoder and a text decoder. - * + * * **Example:** Generate speech from text with `SpeechT5ForSpeechToText`. 
* ```javascript * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@xenova/transformers'; - * + * * // Load the tokenizer and processor * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); - * + * * // Load the models * // NOTE: We use the unquantized versions as they are more accurate * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { quantized: false }); * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { quantized: false }); - * + * * // Load speaker embeddings from URL * const speaker_embeddings_data = new Float32Array( * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() @@ -4677,10 +4890,10 @@ export class SpeechT5Model extends SpeechT5PreTrainedModel { }; * speaker_embeddings_data, * [1, speaker_embeddings_data.length] * ) - * + * * // Run tokenization * const { input_ids } = tokenizer('Hello, my dog is cute'); - * + * * // Generate waveform * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); * console.log(waveform) @@ -4692,197 +4905,208 @@ export class SpeechT5Model extends SpeechT5PreTrainedModel { }; * // } * ``` */ -export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel { } +export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} /** * SpeechT5 Model with a text encoder and a speech decoder. */ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { + /** + * Creates a new instance of the `SpeechT5ForTextToSpeech` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. + * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; - /** - * Creates a new instance of the `SpeechT5ForTextToSpeech` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, decoder_merged_session, generation_config) { - super(config, session); - this.decoder_merged_session = decoder_merged_session; - this.generation_config = generation_config; + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.hidden_size / this.num_decoder_heads; - this.num_decoder_layers = this.config.decoder_layers; - this.num_decoder_heads = this.config.decoder_attention_heads; - this.decoder_dim_kv = this.config.hidden_size / this.num_decoder_heads; + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.hidden_size / this.num_encoder_heads; + } - this.num_encoder_layers = this.config.encoder_layers; - this.num_encoder_heads = this.config.encoder_attention_heads; - this.encoder_dim_kv = this.config.hidden_size / this.num_encoder_heads; + /** + * @typedef {Object} SpeechOutput + * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape + * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided + * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. 
+ * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape + * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. + */ + + /** + * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. + * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. + * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. + * @param {Object} options Optional parameters for generating speech. + * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. + * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. + * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. + * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. + * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. + * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. 
+ */ + async generate_speech( + input_values, + speaker_embeddings, + { + threshold = 0.5, + minlenratio = 0.0, + maxlenratio = 20.0, + vocoder = null, + // output_cross_attentions = false, // TODO add + } = {}, + ) { + const model_inputs = { + input_ids: input_values, + }; + + const { encoder_outputs, encoder_attention_mask } = await encoderForward( + this, + model_inputs, + ); + + const r = encoder_outputs.dims[1] / this.config.reduction_factor; + const maxlen = Math.floor(r * maxlenratio); + const minlen = Math.floor(r * minlenratio); + + const num_mel_bins = this.config.num_mel_bins; + + let spectrogramParts = []; + let past_key_values = null; + let decoder_outputs = null; + let idx = 0; + + while (true) { + ++idx; + + const use_cache_branch = boolTensor(!!decoder_outputs); + let output_sequence; + if (decoder_outputs) { + output_sequence = decoder_outputs.output_sequence_out; + } else { + output_sequence = new Tensor( + "float32", + new Float32Array(num_mel_bins), + [1, 1, num_mel_bins], + ); + } + let decoderFeeds = { + use_cache_branch, + output_sequence, + encoder_attention_mask: encoder_attention_mask, + speaker_embeddings: speaker_embeddings, + encoder_hidden_states: encoder_outputs, + }; + + this.addPastKeyValues(decoderFeeds, past_key_values); + decoder_outputs = await sessionRun( + this.decoder_merged_session, + decoderFeeds, + ); + past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); + + const { prob, spectrum } = decoder_outputs; + spectrogramParts.push(spectrum); + + if ( + idx >= minlen && + // Finished when stop token or maximum length is reached. + (Array.from(prob.data).filter((p) => p >= threshold).length > 0 || + idx >= maxlen) + ) { + break; + } } - /** - * @typedef {Object} SpeechOutput - * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape - * `(output_sequence_length, config.num_mel_bins)`. 
Returned when no `vocoder` is provided - * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. - * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape - * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. - */ + const spectrogram = cat(spectrogramParts); + const { waveform } = await sessionRun(vocoder.session, { spectrogram }); - /** - * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. - * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. - * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. - * @param {Object} options Optional parameters for generating speech. - * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. - * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. - * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. - * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. - * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. - * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. 
- */ - async generate_speech(input_values, speaker_embeddings, { - threshold = 0.5, - minlenratio = 0.0, - maxlenratio = 20.0, - vocoder = null, - // output_cross_attentions = false, // TODO add - } = {}) { - - const model_inputs = { - input_ids: input_values - } - - const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); - - const r = encoder_outputs.dims[1] / this.config.reduction_factor; - const maxlen = Math.floor(r * maxlenratio); - const minlen = Math.floor(r * minlenratio); - - const num_mel_bins = this.config.num_mel_bins; - - let spectrogramParts = []; - let past_key_values = null; - let decoder_outputs = null; - let idx = 0; - - while (true) { - ++idx; - - const use_cache_branch = boolTensor(!!decoder_outputs); - let output_sequence; - if (decoder_outputs) { - output_sequence = decoder_outputs.output_sequence_out; - } else { - output_sequence = new Tensor( - 'float32', - new Float32Array(num_mel_bins), - [1, 1, num_mel_bins], - ) - } - let decoderFeeds = { - use_cache_branch, - output_sequence, - encoder_attention_mask: encoder_attention_mask, - speaker_embeddings: speaker_embeddings, - encoder_hidden_states: encoder_outputs, - }; - - this.addPastKeyValues(decoderFeeds, past_key_values); - decoder_outputs = await sessionRun(this.decoder_merged_session, decoderFeeds); - past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); - - const { prob, spectrum } = decoder_outputs; - spectrogramParts.push(spectrum); - - if (idx >= minlen && ( - // Finished when stop token or maximum length is reached. 
- Array.from(prob.data).filter(p => p >= threshold).length > 0 || idx >= maxlen - )) { - break; - } - } - - const spectrogram = cat(spectrogramParts); - const { waveform } = await sessionRun(vocoder.session, { spectrogram }); - - return { - spectrogram, - waveform, - // cross_attentions: null, // TODO add - } - } + return { + spectrogram, + waveform, + // cross_attentions: null, // TODO add + }; + } } /** * HiFi-GAN vocoder. - * + * * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. */ export class SpeechT5HifiGan extends PreTrainedModel { - main_input_name = 'spectrogram'; + main_input_name = "spectrogram"; } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // TrOCR models export class TrOCRPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `TrOCRPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `TrOCRPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id; + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_encoder_layers = this.num_decoder_layers = this.config.decoder_layers; - this.num_encoder_heads = this.num_decoder_heads = this.config.decoder_attention_heads; - this.encoder_dim_kv = this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; - } + this.num_encoder_layers = this.num_decoder_layers = + this.config.decoder_layers; + this.num_encoder_heads = this.num_decoder_heads = + this.config.decoder_attention_heads; + this.encoder_dim_kv = this.decoder_dim_kv = + this.config.d_model / this.num_decoder_heads; + } } /** * The TrOCR Decoder with a language modeling head. */ -export class TrOCRForCausalLM extends TrOCRPreTrainedModel { } +export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // Mistral models /** * The bare Mistral Model outputting raw hidden-states without any specific head on top. */ export class MistralPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `MistralPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `MistralPreTrainedModel` class. + * @param {Object} config The configuration of the model. 
+ * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_key_value_heads; - this.num_layers = this.config.num_hidden_layers; - this.dim_kv = this.config.hidden_size / this.config.num_attention_heads; - } + this.num_heads = this.config.num_key_value_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.config.num_attention_heads; + } } -export class MistralModel extends MistralPreTrainedModel { } +export class MistralModel extends MistralPreTrainedModel {} -export class MistralForCausalLM extends MistralPreTrainedModel { } +export class MistralForCausalLM extends MistralPreTrainedModel {} ////////////////////////////////////////////////// ////////////////////////////////////////////////// @@ -4891,53 +5115,52 @@ export class MistralForCausalLM extends MistralPreTrainedModel { } * The bare Falcon Model outputting raw hidden-states without any specific head on top. */ export class FalconPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `FalconPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config, session, generation_config) { - super(config, session); - this.generation_config = generation_config; + /** + * Creates a new instance of the `FalconPreTrainedModel` class. 
+ * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, generation_config) { + super(config, session); + this.generation_config = generation_config; - // config doesn't contain pad_token_id, so we assume it is the eos_token_id - this.config.pad_token_id = this.config.eos_token_id + // config doesn't contain pad_token_id, so we assume it is the eos_token_id + this.config.pad_token_id = this.config.eos_token_id; - this.num_heads = this.config.num_attention_heads; - this.num_layers = this.config.num_hidden_layers; - this.dim_kv = this.config.hidden_size / this.config.num_attention_heads; - } + this.num_heads = this.config.num_attention_heads; + this.num_layers = this.config.num_hidden_layers; + this.dim_kv = this.config.hidden_size / this.config.num_attention_heads; + } } -export class FalconModel extends FalconPreTrainedModel { } +export class FalconModel extends FalconPreTrainedModel {} -export class FalconForCausalLM extends FalconPreTrainedModel { } +export class FalconForCausalLM extends FalconPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // CLAP models -export class ClapPreTrainedModel extends PreTrainedModel { } +export class ClapPreTrainedModel extends PreTrainedModel {} -export class ClapModel extends ClapPreTrainedModel { } +export class ClapModel extends ClapPreTrainedModel {} /** * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). - * + * * **Example:** Compute text embeddings with `ClapTextModelWithProjection`. 
- * + * * ```javascript * import { AutoTokenizer, ClapTextModelWithProjection } from '@xenova/transformers'; - * + * * // Load tokenizer and text model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused'); * const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * + * * // Run tokenization * const texts = ['a sound of a cat', 'a sound of a dog']; * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * + * * // Compute embeddings * const { text_embeds } = await text_model(text_inputs); * // Tensor { @@ -4949,31 +5172,30 @@ export class ClapModel extends ClapPreTrainedModel { } * ``` */ export class ClapTextModelWithProjection extends ClapPreTrainedModel { - - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "text_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } /** * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). - * + * * **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`. 
- * + * * ```javascript * import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@xenova/transformers'; - * + * * // Load processor and audio model * const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused'); * const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * + * * // Read audio and run processor * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav'); * const audio_inputs = await processor(audio); - * + * * // Compute embeddings * const { audio_embeds } = await audio_model(audio_inputs); * // Tensor { @@ -4985,34 +5207,33 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel { * ``` */ export class ClapAudioModelWithProjection extends ClapPreTrainedModel { - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'audio_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + // Update default model file name if not provided + options.model_file_name ??= "audio_model"; + return super.from_pretrained(pretrained_model_name_or_path, options); + } } ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // VITS models -export class VitsPreTrainedModel extends PreTrainedModel { } +export class VitsPreTrainedModel extends PreTrainedModel {} /** * The complete VITS model, for text-to-speech synthesis. - * + * * **Example:** Generate speech from text with `VitsModel`. 
* ```javascript * import { AutoTokenizer, VitsModel } from '@xenova/transformers'; - * + * * // Load the tokenizer and model * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mms-tts-eng'); * const model = await VitsModel.from_pretrained('Xenova/mms-tts-eng'); - * + * * // Run tokenization * const inputs = tokenizer('I love transformers'); - * + * * // Generate waveform * const { waveform } = await model(inputs); * // Tensor { @@ -5024,39 +5245,38 @@ export class VitsPreTrainedModel extends PreTrainedModel { } * ``` */ export class VitsModel extends VitsPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} The outputs for the VITS model. - */ - async _call(model_inputs) { - return new VitsModelOutput(await super._call(model_inputs)); - } + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} The outputs for the VITS model. + */ + async _call(model_inputs) { + return new VitsModelOutput(await super._call(model_inputs)); + } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// // Segformer models -export class SegformerPreTrainedModel extends PreTrainedModel { } +export class SegformerPreTrainedModel extends PreTrainedModel {} /** * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. */ -export class SegformerModel extends SegformerPreTrainedModel { } +export class SegformerModel extends SegformerPreTrainedModel {} /** * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. */ -export class SegformerForImageClassification extends SegformerPreTrainedModel { } +export class SegformerForImageClassification extends SegformerPreTrainedModel {} /** * SegFormer Model transformer with an all-MLP decode head on top e.g. 
for ADE20k, CityScapes. */ -export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { } +export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} ////////////////////////////////////////////////// - ////////////////////////////////////////////////// // AutoModels, used to simplify construction of PreTrainedModels // (uses config to instantiate correct class) @@ -5066,729 +5286,904 @@ export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { * which is used to instantiate pretrained models. */ export class PretrainedMixin { - /** - * Mapping from model type to model class. - * @type {Map[]} - */ - static MODEL_CLASS_MAPPINGS = null; + /** + * Mapping from model type to model class. + * @type {Map[]} + */ + static MODEL_CLASS_MAPPINGS = null; - /** - * Whether to attempt to instantiate the base class (`PretrainedModel`) if - * the model type is not found in the mapping. - */ - static BASE_IF_FAIL = false; + /** + * Whether to attempt to instantiate the base class (`PretrainedModel`) if + * the model type is not found in the mapping. 
+ */ + static BASE_IF_FAIL = false; - - /** @type {PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, { - quantized = true, - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - model_file_name = null, - } = {}) { - - let options = { - quantized, - progress_callback, - config, - cache_dir, - local_files_only, - revision, - model_file_name, - } - config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); - if (!options.config) { - // If no config was passed, reuse this config for future processing - options.config = config; - } - - if (!this.MODEL_CLASS_MAPPINGS) { - throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name); - } - - for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) { - const modelInfo = MODEL_CLASS_MAPPING.get(config.model_type); - if (!modelInfo) { - continue; // Item not found in this mapping - } - return await modelInfo[1].from_pretrained(pretrained_model_name_or_path, options); - } - - if (this.BASE_IF_FAIL) { - console.warn(`Unknown model class "${config.model_type}", attempting to construct from base class.`); - return await PreTrainedModel.from_pretrained(pretrained_model_name_or_path, options); - } else { - throw Error(`Unsupported model type: ${config.model_type}`) - } + /** @type {PreTrainedModel.from_pretrained} */ + static async from_pretrained( + pretrained_model_name_or_path, + { + quantized = true, + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + model_file_name = null, + } = {}, + ) { + let options = { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + }; + config = await AutoConfig.from_pretrained( + pretrained_model_name_or_path, + options, + ); + if (!options.config) { + // If no config was passed, reuse this config for 
future processing + options.config = config; } + + if (!this.MODEL_CLASS_MAPPINGS) { + throw new Error( + "`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + + this.name, + ); + } + + for (let MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) { + const modelInfo = MODEL_CLASS_MAPPING.get(config.model_type); + if (!modelInfo) { + continue; // Item not found in this mapping + } + return await modelInfo[1].from_pretrained( + pretrained_model_name_or_path, + options, + ); + } + + if (this.BASE_IF_FAIL) { + console.warn( + `Unknown model class "${config.model_type}", attempting to construct from base class.`, + ); + return await PreTrainedModel.from_pretrained( + pretrained_model_name_or_path, + options, + ); + } else { + throw Error(`Unsupported model type: ${config.model_type}`); + } + } } const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ - ['bert', ['BertModel', BertModel]], - ['roformer', ['RoFormerModel', RoFormerModel]], - ['electra', ['ElectraModel', ElectraModel]], - ['esm', ['EsmModel', EsmModel]], - ['convbert', ['ConvBertModel', ConvBertModel]], - ['camembert', ['CamembertModel', CamembertModel]], - ['deberta', ['DebertaModel', DebertaModel]], - ['deberta-v2', ['DebertaV2Model', DebertaV2Model]], - ['mpnet', ['MPNetModel', MPNetModel]], - ['albert', ['AlbertModel', AlbertModel]], - ['distilbert', ['DistilBertModel', DistilBertModel]], - ['roberta', ['RobertaModel', RobertaModel]], - ['xlm', ['XLMModel', XLMModel]], - ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], - ['clap', ['ClapModel', ClapModel]], - ['clip', ['CLIPModel', CLIPModel]], - ['clipseg', ['CLIPSegModel', CLIPSegModel]], - ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], - ['siglip', ['SiglipModel', SiglipModel]], - ['mobilebert', ['MobileBertModel', MobileBertModel]], - ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], - ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], - ['hubert', ['HubertModel', HubertModel]], - ['wavlm', ['WavLMModel', 
WavLMModel]], - ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], - ['vits', ['VitsModel', VitsModel]], + ["bert", ["BertModel", BertModel]], + ["roformer", ["RoFormerModel", RoFormerModel]], + ["electra", ["ElectraModel", ElectraModel]], + ["esm", ["EsmModel", EsmModel]], + ["convbert", ["ConvBertModel", ConvBertModel]], + ["camembert", ["CamembertModel", CamembertModel]], + ["deberta", ["DebertaModel", DebertaModel]], + ["deberta-v2", ["DebertaV2Model", DebertaV2Model]], + ["mpnet", ["MPNetModel", MPNetModel]], + ["albert", ["AlbertModel", AlbertModel]], + ["distilbert", ["DistilBertModel", DistilBertModel]], + ["roberta", ["RobertaModel", RobertaModel]], + ["xlm", ["XLMModel", XLMModel]], + ["xlm-roberta", ["XLMRobertaModel", XLMRobertaModel]], + ["clap", ["ClapModel", ClapModel]], + ["clip", ["CLIPModel", CLIPModel]], + ["clipseg", ["CLIPSegModel", CLIPSegModel]], + ["chinese_clip", ["ChineseCLIPModel", ChineseCLIPModel]], + ["siglip", ["SiglipModel", SiglipModel]], + ["mobilebert", ["MobileBertModel", MobileBertModel]], + ["squeezebert", ["SqueezeBertModel", SqueezeBertModel]], + ["wav2vec2", ["Wav2Vec2Model", Wav2Vec2Model]], + ["hubert", ["HubertModel", HubertModel]], + ["wavlm", ["WavLMModel", WavLMModel]], + ["audio-spectrogram-transformer", ["ASTModel", ASTModel]], + ["vits", ["VitsModel", VitsModel]], - ['detr', ['DetrModel', DetrModel]], - ['table-transformer', ['TableTransformerModel', TableTransformerModel]], - ['vit', ['ViTModel', ViTModel]], - ['mobilevit', ['MobileViTModel', MobileViTModel]], - ['owlvit', ['OwlViTModel', OwlViTModel]], - ['beit', ['BeitModel', BeitModel]], - ['deit', ['DeiTModel', DeiTModel]], - ['convnext', ['ConvNextModel', ConvNextModel]], - ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]], - ['dinov2', ['Dinov2Model', Dinov2Model]], - ['resnet', ['ResNetModel', ResNetModel]], - ['swin', ['SwinModel', SwinModel]], - ['swin2sr', ['Swin2SRModel', Swin2SRModel]], - ['donut-swin', ['DonutSwinModel', DonutSwinModel]], - 
['yolos', ['YolosModel', YolosModel]], - ['dpt', ['DPTModel', DPTModel]], - ['glpn', ['GLPNModel', GLPNModel]], - - ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]], + ["detr", ["DetrModel", DetrModel]], + ["table-transformer", ["TableTransformerModel", TableTransformerModel]], + ["vit", ["ViTModel", ViTModel]], + ["mobilevit", ["MobileViTModel", MobileViTModel]], + ["owlvit", ["OwlViTModel", OwlViTModel]], + ["beit", ["BeitModel", BeitModel]], + ["deit", ["DeiTModel", DeiTModel]], + ["convnext", ["ConvNextModel", ConvNextModel]], + ["convnextv2", ["ConvNextV2Model", ConvNextV2Model]], + ["dinov2", ["Dinov2Model", Dinov2Model]], + ["resnet", ["ResNetModel", ResNetModel]], + ["swin", ["SwinModel", SwinModel]], + ["swin2sr", ["Swin2SRModel", Swin2SRModel]], + ["donut-swin", ["DonutSwinModel", DonutSwinModel]], + ["yolos", ["YolosModel", YolosModel]], + ["dpt", ["DPTModel", DPTModel]], + ["glpn", ["GLPNModel", GLPNModel]], + ["hifigan", ["SpeechT5HifiGan", SpeechT5HifiGan]], ]); const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ - ['t5', ['T5Model', T5Model]], - ['longt5', ['LongT5Model', LongT5Model]], - ['mt5', ['MT5Model', MT5Model]], - ['bart', ['BartModel', BartModel]], - ['mbart', ['MBartModel', MBartModel]], - ['marian', ['MarianModel', MarianModel]], - ['whisper', ['WhisperModel', WhisperModel]], - ['m2m_100', ['M2M100Model', M2M100Model]], - ['blenderbot', ['BlenderbotModel', BlenderbotModel]], - ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], + ["t5", ["T5Model", T5Model]], + ["longt5", ["LongT5Model", LongT5Model]], + ["mt5", ["MT5Model", MT5Model]], + ["bart", ["BartModel", BartModel]], + ["mbart", ["MBartModel", MBartModel]], + ["marian", ["MarianModel", MarianModel]], + ["whisper", ["WhisperModel", WhisperModel]], + ["m2m_100", ["M2M100Model", M2M100Model]], + ["blenderbot", ["BlenderbotModel", BlenderbotModel]], + ["blenderbot-small", ["BlenderbotSmallModel", BlenderbotSmallModel]], ]); - const MODEL_MAPPING_NAMES_DECODER_ONLY 
= new Map([ - ['bloom', ['BloomModel', BloomModel]], - ['gpt2', ['GPT2Model', GPT2Model]], - ['gptj', ['GPTJModel', GPTJModel]], - ['gpt_bigcode', ['GPTBigCodeModel', GPTBigCodeModel]], - ['gpt_neo', ['GPTNeoModel', GPTNeoModel]], - ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], - ['codegen', ['CodeGenModel', CodeGenModel]], - ['llama', ['LlamaModel', LlamaModel]], - ['phi', ['PhiModel', PhiModel]], - ['mpt', ['MptModel', MptModel]], - ['opt', ['OPTModel', OPTModel]], - ['mistral', ['MistralModel', MistralModel]], - ['falcon', ['FalconModel', FalconModel]], + ["bloom", ["BloomModel", BloomModel]], + ["gpt2", ["GPT2Model", GPT2Model]], + ["gptj", ["GPTJModel", GPTJModel]], + ["gpt_bigcode", ["GPTBigCodeModel", GPTBigCodeModel]], + ["gpt_neo", ["GPTNeoModel", GPTNeoModel]], + ["gpt_neox", ["GPTNeoXModel", GPTNeoXModel]], + ["codegen", ["CodeGenModel", CodeGenModel]], + ["llama", ["LlamaModel", LlamaModel]], + ["phi", ["PhiModel", PhiModel]], + ["mpt", ["MptModel", MptModel]], + ["opt", ["OPTModel", OPTModel]], + ["mistral", ["MistralModel", MistralModel]], + ["falcon", ["FalconModel", FalconModel]], ]); const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], - ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], + ["speecht5", ["SpeechT5ForSpeechToText", SpeechT5ForSpeechToText]], + [ + "whisper", + ["WhisperForConditionalGeneration", WhisperForConditionalGeneration], + ], ]); const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForTextToSpeech', SpeechT5ForTextToSpeech]], + ["speecht5", ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]], ]); const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([ - ['vits', ['VitsModel', VitsModel]], + ["vits", ["VitsModel", VitsModel]], ]); const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForSequenceClassification', BertForSequenceClassification]], - 
['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]], - ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]], - ['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]], - ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]], - ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]], - ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]], - ['deberta-v2', ['DebertaV2ForSequenceClassification', DebertaV2ForSequenceClassification]], - ['mpnet', ['MPNetForSequenceClassification', MPNetForSequenceClassification]], - ['albert', ['AlbertForSequenceClassification', AlbertForSequenceClassification]], - ['distilbert', ['DistilBertForSequenceClassification', DistilBertForSequenceClassification]], - ['roberta', ['RobertaForSequenceClassification', RobertaForSequenceClassification]], - ['xlm', ['XLMForSequenceClassification', XLMForSequenceClassification]], - ['xlm-roberta', ['XLMRobertaForSequenceClassification', XLMRobertaForSequenceClassification]], - ['bart', ['BartForSequenceClassification', BartForSequenceClassification]], - ['mbart', ['MBartForSequenceClassification', MBartForSequenceClassification]], - ['mobilebert', ['MobileBertForSequenceClassification', MobileBertForSequenceClassification]], - ['squeezebert', ['SqueezeBertForSequenceClassification', SqueezeBertForSequenceClassification]], + ["bert", ["BertForSequenceClassification", BertForSequenceClassification]], + [ + "roformer", + ["RoFormerForSequenceClassification", RoFormerForSequenceClassification], + ], + [ + "electra", + ["ElectraForSequenceClassification", ElectraForSequenceClassification], + ], + ["esm", ["EsmForSequenceClassification", EsmForSequenceClassification]], + [ + "convbert", + ["ConvBertForSequenceClassification", ConvBertForSequenceClassification], + ], + [ + "camembert", + ["CamembertForSequenceClassification", 
CamembertForSequenceClassification], + ], + [ + "deberta", + ["DebertaForSequenceClassification", DebertaForSequenceClassification], + ], + [ + "deberta-v2", + ["DebertaV2ForSequenceClassification", DebertaV2ForSequenceClassification], + ], + ["mpnet", ["MPNetForSequenceClassification", MPNetForSequenceClassification]], + [ + "albert", + ["AlbertForSequenceClassification", AlbertForSequenceClassification], + ], + [ + "distilbert", + [ + "DistilBertForSequenceClassification", + DistilBertForSequenceClassification, + ], + ], + [ + "roberta", + ["RobertaForSequenceClassification", RobertaForSequenceClassification], + ], + ["xlm", ["XLMForSequenceClassification", XLMForSequenceClassification]], + [ + "xlm-roberta", + [ + "XLMRobertaForSequenceClassification", + XLMRobertaForSequenceClassification, + ], + ], + ["bart", ["BartForSequenceClassification", BartForSequenceClassification]], + ["mbart", ["MBartForSequenceClassification", MBartForSequenceClassification]], + [ + "mobilebert", + [ + "MobileBertForSequenceClassification", + MobileBertForSequenceClassification, + ], + ], + [ + "squeezebert", + [ + "SqueezeBertForSequenceClassification", + SqueezeBertForSequenceClassification, + ], + ], ]); const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForTokenClassification', BertForTokenClassification]], - ['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]], - ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]], - ['esm', ['EsmForTokenClassification', EsmForTokenClassification]], - ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]], - ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]], - ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]], - ['deberta-v2', ['DebertaV2ForTokenClassification', DebertaV2ForTokenClassification]], - ['mpnet', ['MPNetForTokenClassification', MPNetForTokenClassification]], - 
['distilbert', ['DistilBertForTokenClassification', DistilBertForTokenClassification]], - ['roberta', ['RobertaForTokenClassification', RobertaForTokenClassification]], - ['xlm', ['XLMForTokenClassification', XLMForTokenClassification]], - ['xlm-roberta', ['XLMRobertaForTokenClassification', XLMRobertaForTokenClassification]], + ["bert", ["BertForTokenClassification", BertForTokenClassification]], + [ + "roformer", + ["RoFormerForTokenClassification", RoFormerForTokenClassification], + ], + ["electra", ["ElectraForTokenClassification", ElectraForTokenClassification]], + ["esm", ["EsmForTokenClassification", EsmForTokenClassification]], + [ + "convbert", + ["ConvBertForTokenClassification", ConvBertForTokenClassification], + ], + [ + "camembert", + ["CamembertForTokenClassification", CamembertForTokenClassification], + ], + ["deberta", ["DebertaForTokenClassification", DebertaForTokenClassification]], + [ + "deberta-v2", + ["DebertaV2ForTokenClassification", DebertaV2ForTokenClassification], + ], + ["mpnet", ["MPNetForTokenClassification", MPNetForTokenClassification]], + [ + "distilbert", + ["DistilBertForTokenClassification", DistilBertForTokenClassification], + ], + ["roberta", ["RobertaForTokenClassification", RobertaForTokenClassification]], + ["xlm", ["XLMForTokenClassification", XLMForTokenClassification]], + [ + "xlm-roberta", + ["XLMRobertaForTokenClassification", XLMRobertaForTokenClassification], + ], ]); const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([ - ['t5', ['T5ForConditionalGeneration', T5ForConditionalGeneration]], - ['longt5', ['LongT5ForConditionalGeneration', LongT5ForConditionalGeneration]], - ['mt5', ['MT5ForConditionalGeneration', MT5ForConditionalGeneration]], - ['bart', ['BartForConditionalGeneration', BartForConditionalGeneration]], - ['mbart', ['MBartForConditionalGeneration', MBartForConditionalGeneration]], - ['marian', ['MarianMTModel', MarianMTModel]], - ['m2m_100', ['M2M100ForConditionalGeneration', 
M2M100ForConditionalGeneration]], - ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], - ['blenderbot-small', ['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], + ["t5", ["T5ForConditionalGeneration", T5ForConditionalGeneration]], + [ + "longt5", + ["LongT5ForConditionalGeneration", LongT5ForConditionalGeneration], + ], + ["mt5", ["MT5ForConditionalGeneration", MT5ForConditionalGeneration]], + ["bart", ["BartForConditionalGeneration", BartForConditionalGeneration]], + ["mbart", ["MBartForConditionalGeneration", MBartForConditionalGeneration]], + ["marian", ["MarianMTModel", MarianMTModel]], + [ + "m2m_100", + ["M2M100ForConditionalGeneration", M2M100ForConditionalGeneration], + ], + [ + "blenderbot", + ["BlenderbotForConditionalGeneration", BlenderbotForConditionalGeneration], + ], + [ + "blenderbot-small", + [ + "BlenderbotSmallForConditionalGeneration", + BlenderbotSmallForConditionalGeneration, + ], + ], ]); const MODEL_WITH_LM_HEAD_MAPPING_NAMES = new Map([ - ['bloom', ['BloomForCausalLM', BloomForCausalLM]], - ['gpt2', ['GPT2LMHeadModel', GPT2LMHeadModel]], - ['gptj', ['GPTJForCausalLM', GPTJForCausalLM]], - ['gpt_bigcode', ['GPTBigCodeForCausalLM', GPTBigCodeForCausalLM]], - ['gpt_neo', ['GPTNeoForCausalLM', GPTNeoForCausalLM]], - ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], - ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], - ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], - ['phi', ['PhiForCausalLM', PhiForCausalLM]], - ['mpt', ['MptForCausalLM', MptForCausalLM]], - ['opt', ['OPTForCausalLM', OPTForCausalLM]], - ['mbart', ['MBartForCausalLM', MBartForCausalLM]], - ['mistral', ['MistralForCausalLM', MistralForCausalLM]], - ['falcon', ['FalconForCausalLM', FalconForCausalLM]], - ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], + ["bloom", ["BloomForCausalLM", BloomForCausalLM]], + ["gpt2", ["GPT2LMHeadModel", GPT2LMHeadModel]], + ["gptj", ["GPTJForCausalLM", 
GPTJForCausalLM]], + ["gpt_bigcode", ["GPTBigCodeForCausalLM", GPTBigCodeForCausalLM]], + ["gpt_neo", ["GPTNeoForCausalLM", GPTNeoForCausalLM]], + ["gpt_neox", ["GPTNeoXForCausalLM", GPTNeoXForCausalLM]], + ["codegen", ["CodeGenForCausalLM", CodeGenForCausalLM]], + ["llama", ["LlamaForCausalLM", LlamaForCausalLM]], + ["phi", ["PhiForCausalLM", PhiForCausalLM]], + ["mpt", ["MptForCausalLM", MptForCausalLM]], + ["opt", ["OPTForCausalLM", OPTForCausalLM]], + ["mbart", ["MBartForCausalLM", MBartForCausalLM]], + ["mistral", ["MistralForCausalLM", MistralForCausalLM]], + ["falcon", ["FalconForCausalLM", FalconForCausalLM]], + ["trocr", ["TrOCRForCausalLM", TrOCRForCausalLM]], ]); const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([ - ['bert', ['BertForMaskedLM', BertForMaskedLM]], - ['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]], - ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]], - ['esm', ['EsmForMaskedLM', EsmForMaskedLM]], - ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]], - ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]], - ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]], - ['deberta-v2', ['DebertaV2ForMaskedLM', DebertaV2ForMaskedLM]], - ['mpnet', ['MPNetForMaskedLM', MPNetForMaskedLM]], - ['albert', ['AlbertForMaskedLM', AlbertForMaskedLM]], - ['distilbert', ['DistilBertForMaskedLM', DistilBertForMaskedLM]], - ['roberta', ['RobertaForMaskedLM', RobertaForMaskedLM]], - ['xlm', ['XLMWithLMHeadModel', XLMWithLMHeadModel]], - ['xlm-roberta', ['XLMRobertaForMaskedLM', XLMRobertaForMaskedLM]], - ['mobilebert', ['MobileBertForMaskedLM', MobileBertForMaskedLM]], - ['squeezebert', ['SqueezeBertForMaskedLM', SqueezeBertForMaskedLM]], + ["bert", ["BertForMaskedLM", BertForMaskedLM]], + ["roformer", ["RoFormerForMaskedLM", RoFormerForMaskedLM]], + ["electra", ["ElectraForMaskedLM", ElectraForMaskedLM]], + ["esm", ["EsmForMaskedLM", EsmForMaskedLM]], + ["convbert", ["ConvBertForMaskedLM", ConvBertForMaskedLM]], + ["camembert", 
["CamembertForMaskedLM", CamembertForMaskedLM]], + ["deberta", ["DebertaForMaskedLM", DebertaForMaskedLM]], + ["deberta-v2", ["DebertaV2ForMaskedLM", DebertaV2ForMaskedLM]], + ["mpnet", ["MPNetForMaskedLM", MPNetForMaskedLM]], + ["albert", ["AlbertForMaskedLM", AlbertForMaskedLM]], + ["distilbert", ["DistilBertForMaskedLM", DistilBertForMaskedLM]], + ["roberta", ["RobertaForMaskedLM", RobertaForMaskedLM]], + ["xlm", ["XLMWithLMHeadModel", XLMWithLMHeadModel]], + ["xlm-roberta", ["XLMRobertaForMaskedLM", XLMRobertaForMaskedLM]], + ["mobilebert", ["MobileBertForMaskedLM", MobileBertForMaskedLM]], + ["squeezebert", ["SqueezeBertForMaskedLM", SqueezeBertForMaskedLM]], ]); const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]], - ['roformer', ['RoFormerForQuestionAnswering', RoFormerForQuestionAnswering]], - ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]], - ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]], - ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]], - ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]], - ['deberta-v2', ['DebertaV2ForQuestionAnswering', DebertaV2ForQuestionAnswering]], - ['mpnet', ['MPNetForQuestionAnswering', MPNetForQuestionAnswering]], - ['albert', ['AlbertForQuestionAnswering', AlbertForQuestionAnswering]], - ['distilbert', ['DistilBertForQuestionAnswering', DistilBertForQuestionAnswering]], - ['roberta', ['RobertaForQuestionAnswering', RobertaForQuestionAnswering]], - ['xlm', ['XLMForQuestionAnswering', XLMForQuestionAnswering]], - ['xlm-roberta', ['XLMRobertaForQuestionAnswering', XLMRobertaForQuestionAnswering]], - ['mobilebert', ['MobileBertForQuestionAnswering', MobileBertForQuestionAnswering]], - ['squeezebert', ['SqueezeBertForQuestionAnswering', SqueezeBertForQuestionAnswering]], + ["bert", ["BertForQuestionAnswering", BertForQuestionAnswering]], + 
["roformer", ["RoFormerForQuestionAnswering", RoFormerForQuestionAnswering]], + ["electra", ["ElectraForQuestionAnswering", ElectraForQuestionAnswering]], + ["convbert", ["ConvBertForQuestionAnswering", ConvBertForQuestionAnswering]], + [ + "camembert", + ["CamembertForQuestionAnswering", CamembertForQuestionAnswering], + ], + ["deberta", ["DebertaForQuestionAnswering", DebertaForQuestionAnswering]], + [ + "deberta-v2", + ["DebertaV2ForQuestionAnswering", DebertaV2ForQuestionAnswering], + ], + ["mpnet", ["MPNetForQuestionAnswering", MPNetForQuestionAnswering]], + ["albert", ["AlbertForQuestionAnswering", AlbertForQuestionAnswering]], + [ + "distilbert", + ["DistilBertForQuestionAnswering", DistilBertForQuestionAnswering], + ], + ["roberta", ["RobertaForQuestionAnswering", RobertaForQuestionAnswering]], + ["xlm", ["XLMForQuestionAnswering", XLMForQuestionAnswering]], + [ + "xlm-roberta", + ["XLMRobertaForQuestionAnswering", XLMRobertaForQuestionAnswering], + ], + [ + "mobilebert", + ["MobileBertForQuestionAnswering", MobileBertForQuestionAnswering], + ], + [ + "squeezebert", + ["SqueezeBertForQuestionAnswering", SqueezeBertForQuestionAnswering], + ], ]); const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], + [ + "vision-encoder-decoder", + ["VisionEncoderDecoderModel", VisionEncoderDecoderModel], + ], ]); const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], + [ + "vision-encoder-decoder", + ["VisionEncoderDecoderModel", VisionEncoderDecoderModel], + ], ]); const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['vit', ['ViTForImageClassification', ViTForImageClassification]], - ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]], - ['beit', ['BeitForImageClassification', BeitForImageClassification]], - ['deit', 
['DeiTForImageClassification', DeiTForImageClassification]], - ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]], - ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]], - ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]], - ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]], - ['swin', ['SwinForImageClassification', SwinForImageClassification]], - ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]], + ["vit", ["ViTForImageClassification", ViTForImageClassification]], + [ + "mobilevit", + ["MobileViTForImageClassification", MobileViTForImageClassification], + ], + ["beit", ["BeitForImageClassification", BeitForImageClassification]], + ["deit", ["DeiTForImageClassification", DeiTForImageClassification]], + [ + "convnext", + ["ConvNextForImageClassification", ConvNextForImageClassification], + ], + [ + "convnextv2", + ["ConvNextV2ForImageClassification", ConvNextV2ForImageClassification], + ], + ["dinov2", ["Dinov2ForImageClassification", Dinov2ForImageClassification]], + ["resnet", ["ResNetForImageClassification", ResNetForImageClassification]], + ["swin", ["SwinForImageClassification", SwinForImageClassification]], + [ + "segformer", + ["SegformerForImageClassification", SegformerForImageClassification], + ], ]); const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForObjectDetection', DetrForObjectDetection]], - ['table-transformer', ['TableTransformerForObjectDetection', TableTransformerForObjectDetection]], - ['yolos', ['YolosForObjectDetection', YolosForObjectDetection]], + ["detr", ["DetrForObjectDetection", DetrForObjectDetection]], + [ + "table-transformer", + ["TableTransformerForObjectDetection", TableTransformerForObjectDetection], + ], + ["yolos", ["YolosForObjectDetection", YolosForObjectDetection]], ]); const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['owlvit', 
['OwlViTForObjectDetection', OwlViTForObjectDetection]], + ["owlvit", ["OwlViTForObjectDetection", OwlViTForObjectDetection]], ]); const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForSegmentation', DetrForSegmentation]], - ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]], + ["detr", ["DetrForSegmentation", DetrForSegmentation]], + ["clipseg", ["CLIPSegForImageSegmentation", CLIPSegForImageSegmentation]], ]); const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([ - ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]], + [ + "segformer", + ["SegformerForSemanticSegmentation", SegformerForSemanticSegmentation], + ], ]); const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ - ['sam', ['SamModel', SamModel]], + ["sam", ["SamModel", SamModel]], ]); const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], - ['wavlm', ['WavLMForCTC', WavLMForCTC]], - ['hubert', ['HubertForCTC', HubertForCTC]], + ["wav2vec2", ["Wav2Vec2ForCTC", Wav2Vec2ForCTC]], + ["wavlm", ["WavLMForCTC", WavLMForCTC]], + ["hubert", ["HubertForCTC", HubertForCTC]], ]); const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], - ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], - ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], - ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]], + [ + "wav2vec2", + ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification], + ], + ["wavlm", ["WavLMForSequenceClassification", WavLMForSequenceClassification]], + [ + "hubert", + ["HubertForSequenceClassification", HubertForSequenceClassification], + ], + [ + "audio-spectrogram-transformer", + ["ASTForAudioClassification", ASTForAudioClassification], + ], ]); const 
MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ - ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], + ["vitmatte", ["VitMatteForImageMatting", VitMatteForImageMatting]], ]); const MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = new Map([ - ['swin2sr', ['Swin2SRForImageSuperResolution', Swin2SRForImageSuperResolution]], -]) + [ + "swin2sr", + ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution], + ], +]); const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([ - ['dpt', ['DPTForDepthEstimation', DPTForDepthEstimation]], - ['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]], -]) - + ["dpt", ["DPTForDepthEstimation", DPTForDepthEstimation]], + ["glpn", ["GLPNForDepthEstimation", GLPNForDepthEstimation]], +]); const MODEL_CLASS_TYPE_MAPPING = [ - [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], - [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], - [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], - [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], - [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], - [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - 
[MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], - [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], + [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], + [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], + [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], + [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], + [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], + [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + 
[MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], ]; for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { - // @ts-ignore - for (const [name, model] of mappings.values()) { - MODEL_TYPE_MAPPING.set(name, type); - MODEL_CLASS_TO_NAME_MAPPING.set(model, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, model); - } -} - -const CUSTOM_MAPPING = [ - ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection, MODEL_TYPES.EncoderOnly], - ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], - ['SiglipVisionModel', SiglipVisionModel, MODEL_TYPES.EncoderOnly], - ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly], -] -for (const [name, model, type] of CUSTOM_MAPPING) { + // @ts-ignore + for (const [name, model] of mappings.values()) { MODEL_TYPE_MAPPING.set(name, type); MODEL_CLASS_TO_NAME_MAPPING.set(model, name); MODEL_NAME_TO_CLASS_MAPPING.set(name, model); + } } +const CUSTOM_MAPPING = [ + [ + "CLIPTextModelWithProjection", + CLIPTextModelWithProjection, + MODEL_TYPES.EncoderOnly, + ], + [ + "CLIPVisionModelWithProjection", + CLIPVisionModelWithProjection, + MODEL_TYPES.EncoderOnly, + ], + ["SiglipTextModel", SiglipTextModel, MODEL_TYPES.EncoderOnly], + ["SiglipVisionModel", SiglipVisionModel, MODEL_TYPES.EncoderOnly], + [ + "ClapTextModelWithProjection", + ClapTextModelWithProjection, + MODEL_TYPES.EncoderOnly, + ], + [ + "ClapAudioModelWithProjection", + ClapAudioModelWithProjection, + MODEL_TYPES.EncoderOnly, + ], +]; +for (const [name, model, type] of CUSTOM_MAPPING) { + MODEL_TYPE_MAPPING.set(name, type); + MODEL_CLASS_TO_NAME_MAPPING.set(model, name); + 
MODEL_NAME_TO_CLASS_MAPPING.set(name, model); +} /** * Helper class which is used to instantiate pretrained models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModel.from_pretrained('bert-base-uncased'); */ export class AutoModel extends PretrainedMixin { - /** @type {Map[]} */ - // @ts-ignore - static MODEL_CLASS_MAPPINGS = MODEL_CLASS_TYPE_MAPPING.map(x => x[0]); - static BASE_IF_FAIL = true; + /** @type {Map[]} */ + // @ts-ignore + static MODEL_CLASS_MAPPINGS = MODEL_CLASS_TYPE_MAPPING.map((x) => x[0]); + static BASE_IF_FAIL = true; } /** * Helper class which is used to instantiate pretrained sequence classification models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english'); */ export class AutoModelForSequenceClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [ + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + ]; } /** * Helper class which is used to instantiate pretrained token classification models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForTokenClassification.from_pretrained('Davlan/distilbert-base-multilingual-cased-ner-hrl'); */ export class AutoModelForTokenClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence models with the `from_pretrained` function. 
* The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForSeq2SeqLM.from_pretrained('t5-small'); */ export class AutoModelForSeq2SeqLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence speech-to-text models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForSpeechSeq2Seq.from_pretrained('openai/whisper-tiny.en'); */ export class AutoModelForSpeechSeq2Seq extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence text-to-spectrogram models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForTextToSpectrogram.from_pretrained('microsoft/speecht5_tts'); */ export class AutoModelForTextToSpectrogram extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained text-to-waveform models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. 
- * + * * @example * let model = await AutoModelForTextToSpectrogram.from_pretrained('facebook/mms-tts-eng'); */ export class AutoModelForTextToWaveform extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained causal language models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForCausalLM.from_pretrained('gpt2'); */ export class AutoModelForCausalLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained masked language models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForMaskedLM.from_pretrained('bert-base-uncased'); */ export class AutoModelForMaskedLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASKED_LM_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASKED_LM_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained question answering models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. 
- * + * * @example * let model = await AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad'); */ export class AutoModelForQuestionAnswering extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained vision-to-sequence models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForVision2Seq.from_pretrained('nlpconnect/vit-gpt2-image-captioning'); */ export class AutoModelForVision2Seq extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained image classification models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForImageClassification.from_pretrained('google/vit-base-patch16-224'); */ export class AutoModelForImageClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained image segmentation models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. 
- * + * * @example * let model = await AutoModelForImageSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic'); */ export class AutoModelForImageSegmentation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained image segmentation models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForSemanticSegmentation.from_pretrained('nvidia/segformer-b3-finetuned-cityscapes-1024-1024'); */ export class AutoModelForSemanticSegmentation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]; } /** * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. - * + * * @example * let model = await AutoModelForObjectDetection.from_pretrained('facebook/detr-resnet-50'); */ export class AutoModelForObjectDetection extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]; } export class AutoModelForZeroShotObjectDetection extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [ + MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, + ]; } - /** * Helper class which is used to instantiate pretrained mask generation models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. 
- * + * * @example * let model = await AutoModelForMaskGeneration.from_pretrained('Xenova/sam-vit-base'); */ export class AutoModelForMaskGeneration extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]; } export class AutoModelForCTC extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]; } export class AutoModelForAudioClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]; } export class AutoModelForDocumentQuestionAnswering extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [ + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + ]; } export class AutoModelForImageMatting extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]; } export class AutoModelForImageToImage extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]; } export class AutoModelForDepthEstimation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS = [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]; + static MODEL_CLASS_MAPPINGS = [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]; } ////////////////////////////////////////////////// ////////////////////////////////////////////////// export class Seq2SeqLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. 
- * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. - * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. - * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - */ - constructor({ logits, past_key_values, encoder_outputs, decoder_attentions = null, cross_attentions = null }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - this.encoder_outputs = encoder_outputs; - this.decoder_attentions = decoder_attentions; - this.cross_attentions = cross_attentions; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. + * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. + * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. + * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. 
+ */ + constructor({ + logits, + past_key_values, + encoder_outputs, + decoder_attentions = null, + cross_attentions = null, + }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + this.encoder_outputs = encoder_outputs; + this.decoder_attentions = decoder_attentions; + this.cross_attentions = cross_attentions; + } } /** * Base class for outputs of sentence classification models. */ export class SequenceClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } } /** * Base class for outputs of token classification models. */ export class TokenClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification scores (before SoftMax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification scores (before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } } /** * Base class for masked language models outputs. */ export class MaskedLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } + /** + * @param {Object} output The output of the model. 
+ * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } } /** * Base class for outputs of question answering models. */ export class QuestionAnsweringModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.start_logits Span-start scores (before SoftMax). - * @param {Tensor} output.end_logits Span-end scores (before SoftMax). - */ - constructor({ start_logits, end_logits }) { - super(); - this.start_logits = start_logits; - this.end_logits = end_logits; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.start_logits Span-start scores (before SoftMax). + * @param {Tensor} output.end_logits Span-end scores (before SoftMax). + */ + constructor({ start_logits, end_logits }) { + super(); + this.start_logits = start_logits; + this.end_logits = end_logits; + } } - /** * Base class for causal language model (or autoregressive) outputs. */ export class CausalLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } } /** * Base class for causal language model (or autoregressive) outputs. */ export class CausalLMOutputWithPast extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). 
- * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) - * that can be used (see `past_key_values` input) to speed up sequential decoding. - */ - constructor({ logits, past_key_values }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) + * that can be used (see `past_key_values` input) to speed up sequential decoding. + */ + constructor({ logits, past_key_values }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + } } export class ImageMattingOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. - */ - constructor({ alphas }) { - super(); - this.alphas = alphas; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. + */ + constructor({ alphas }) { + super(); + this.alphas = alphas; + } } /** * Describes the outputs for the VITS model. */ export class VitsModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. - * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. - * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. 
- */ - constructor({ waveform, spectrogram }) { - super(); - this.waveform = waveform; - this.spectrogram = spectrogram; - } + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. + * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. + * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. + */ + constructor({ waveform, spectrogram }) { + super(); + this.waveform = waveform; + this.spectrogram = spectrogram; + } } diff --git a/core/vendor/modules/@xenova/transformers/src/pipelines.js b/core/vendor/modules/@xenova/transformers/src/pipelines.js index 0c388e282..fd06deece 100644 --- a/core/vendor/modules/@xenova/transformers/src/pipelines.js +++ b/core/vendor/modules/@xenova/transformers/src/pipelines.js @@ -1,75 +1,51 @@ /** * @file Pipelines provide a high-level, easy to use, API for running machine learning models. - * + * * **Example:** Instantiate pipeline using the `pipeline` function. 
* ```javascript * import { pipeline } from '@xenova/transformers'; - * + * * const classifier = await pipeline('sentiment-analysis'); * const output = await classifier('I love transformers!'); * // [{'label': 'POSITIVE', 'score': 0.999817686}] * ``` - * + * * @module pipelines */ import { - AutoModel, - AutoModelForAudioClassification, - AutoModelForCTC, - AutoModelForCausalLM, - AutoModelForDepthEstimation, - AutoModelForDocumentQuestionAnswering, - AutoModelForImageClassification, - AutoModelForImageSegmentation, - AutoModelForImageToImage, - AutoModelForMaskedLM, - AutoModelForObjectDetection, - AutoModelForQuestionAnswering, - AutoModelForSemanticSegmentation, - AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification, - AutoModelForSpeechSeq2Seq, - AutoModelForTextToSpectrogram, - AutoModelForTextToWaveform, - AutoModelForTokenClassification, - AutoModelForVision2Seq, - AutoModelForZeroShotObjectDetection, - PreTrainedModel, -} from './models.js'; -import { - AutoProcessor, - Processor -} from './processors.js'; -import { - AutoTokenizer, - PreTrainedTokenizer, -} from './tokenizers.js'; + AutoModel, + AutoModelForAudioClassification, + AutoModelForCTC, + AutoModelForCausalLM, + AutoModelForDepthEstimation, + AutoModelForDocumentQuestionAnswering, + AutoModelForImageClassification, + AutoModelForImageSegmentation, + AutoModelForImageToImage, + AutoModelForMaskedLM, + AutoModelForObjectDetection, + AutoModelForQuestionAnswering, + AutoModelForSemanticSegmentation, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForSpeechSeq2Seq, + AutoModelForTextToSpectrogram, + AutoModelForTextToWaveform, + AutoModelForTokenClassification, + AutoModelForVision2Seq, + AutoModelForZeroShotObjectDetection, + PreTrainedModel, +} from "./models.js"; +import { AutoProcessor, Processor } from "./processors.js"; +import { AutoTokenizer, PreTrainedTokenizer } from "./tokenizers.js"; - -import { - read_audio -} from './utils/audio.js'; -import { - Callable, 
- dispatchCallback, - pop, - product, -} from './utils/core.js'; -import { - getTopItems, - max, - round, - softmax, -} from './utils/maths.js'; -import { - Tensor, - interpolate, - mean_pooling, -} from './utils/tensor.js'; +import { read_audio } from "./utils/audio.js"; +import { Callable, dispatchCallback, pop, product } from "./utils/core.js"; +import { getTopItems, max, round, softmax } from "./utils/maths.js"; +import { Tensor, interpolate, mean_pooling } from "./utils/tensor.js"; // import { RawImage } from './utils/image.js'; - /** * @typedef {string | RawImage | URL} ImageInput * @typedef {ImageInput|ImageInput[]} ImagePipelineInputs @@ -82,13 +58,13 @@ import { * @private */ async function prepareImages(images) { - if (!Array.isArray(images)) { - images = [images]; - } + if (!Array.isArray(images)) { + images = [images]; + } - // Possibly convert any non-images to images - // return await Promise.all(images.map(x => RawImage.read(x))); - return Promise.resolve([]); + // Possibly convert any non-images to images + // return await Promise.all(images.map(x => RawImage.read(x))); + return Promise.resolve([]); } /** @@ -104,18 +80,20 @@ async function prepareImages(images) { * @private */ async function prepareAudios(audios, sampling_rate) { - if (!Array.isArray(audios)) { - audios = [audios]; - } + if (!Array.isArray(audios)) { + audios = [audios]; + } - return await Promise.all(audios.map(x => { - if (typeof x === 'string' || x instanceof URL) { - return read_audio(x, sampling_rate); - } else if (x instanceof Float64Array) { - return new Float32Array(x); - } - return x; - })); + return await Promise.all( + audios.map((x) => { + if (typeof x === "string" || x instanceof URL) { + return read_audio(x, sampling_rate); + } else if (x instanceof Float64Array) { + return new Float32Array(x); + } + return x; + }), + ); } /** @@ -134,19 +112,18 @@ async function prepareAudios(audios, sampling_rate) { * @private */ function get_bounding_box(box, asInteger) { - if 
(asInteger) { - box = box.map(x => x | 0); - } - const [xmin, ymin, xmax, ymax] = box; + if (asInteger) { + box = box.map((x) => x | 0); + } + const [xmin, ymin, xmax, ymax] = box; - return { xmin, ymin, xmax, ymax }; + return { xmin, ymin, xmax, ymax }; } - /** * @callback DisposeType Disposes the item. * @returns {Promise} A promise that resolves when the item has been disposed. - * + * * @typedef {Object} Disposable * @property {DisposeType} dispose A promise that resolves when the pipeline has been disposed. */ @@ -157,26 +134,26 @@ function get_bounding_box(box, asInteger) { * @extends Callable */ export class Pipeline extends Callable { - /** - * Create a new Pipeline. - * @param {Object} options An object containing the following properties: - * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks. - * @param {PreTrainedModel} [options.model] The model used by the pipeline. - * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any). - * @param {Processor} [options.processor=null] The processor used by the pipeline (if any). - */ - constructor({ task, model, tokenizer = null, processor = null }) { - super(); - this.task = task; - this.model = model; - this.tokenizer = tokenizer; - this.processor = processor; - } + /** + * Create a new Pipeline. + * @param {Object} options An object containing the following properties: + * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks. + * @param {PreTrainedModel} [options.model] The model used by the pipeline. + * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any). + * @param {Processor} [options.processor=null] The processor used by the pipeline (if any). 
+ */ + constructor({ task, model, tokenizer = null, processor = null }) { + super(); + this.task = task; + this.model = model; + this.tokenizer = tokenizer; + this.processor = processor; + } - /** @type {DisposeType} */ - async dispose() { - await this.model.dispose(); - } + /** @type {DisposeType} */ + async dispose() { + await this.model.dispose(); + } } /** @@ -184,7 +161,7 @@ export class Pipeline extends Callable { * @property {string} task The task of the pipeline. Useful for specifying subtasks. * @property {PreTrainedModel} model The model used by the pipeline. * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline. - * + * * @typedef {ModelTokenizerConstructorArgs} TextPipelineConstructorArgs An object used to instantiate a text-based pipeline. */ @@ -193,19 +170,18 @@ export class Pipeline extends Callable { * @property {string} task The task of the pipeline. Useful for specifying subtasks. * @property {PreTrainedModel} model The model used by the pipeline. * @property {Processor} processor The processor used by the pipeline. - * + * * @typedef {ModelProcessorConstructorArgs} AudioPipelineConstructorArgs An object used to instantiate an audio-based pipeline. * @typedef {ModelProcessorConstructorArgs} ImagePipelineConstructorArgs An object used to instantiate an image-based pipeline. */ - /** * @typedef {Object} ModelTokenizerProcessorConstructorArgs * @property {string} task The task of the pipeline. Useful for specifying subtasks. * @property {PreTrainedModel} model The model used by the pipeline. * @property {PreTrainedTokenizer} tokenizer The tokenizer used by the pipeline. * @property {Processor} processor The processor used by the pipeline. - * + * * @typedef {ModelTokenizerProcessorConstructorArgs} TextAudioPipelineConstructorArgs An object used to instantiate a text- and audio-based pipeline. 
* @typedef {ModelTokenizerProcessorConstructorArgs} TextImagePipelineConstructorArgs An object used to instantiate a text- and image-based pipeline. */ @@ -215,15 +191,15 @@ export class Pipeline extends Callable { * @property {string} label The label predicted. * @property {number} score The corresponding probability. * @typedef {TextClassificationSingle[]} TextClassificationOutput - * + * * @typedef {Object} TextClassificationPipelineOptions Parameters specific to text classification pipelines. * @property {number} [topk=1] The number of top predictions to be returned. - * + * * @callback TextClassificationPipelineCallback Classify the text(s) given as inputs. * @param {string|string[]} texts The input text(s) to be classified. * @param {TextClassificationPipelineOptions} [options] The options to use for text classification. * @returns {Promise} An array or object containing the predicted labels and scores. - * + * * @typedef {TextPipelineConstructorArgs & TextClassificationPipelineCallback & Disposable} TextClassificationPipelineType */ @@ -236,7 +212,7 @@ export class Pipeline extends Callable { * const output = await classifier('I love transformers!'); * // [{ label: 'POSITIVE', score: 0.999788761138916 }] * ``` - * + * * **Example:** Multilingual sentiment-analysis w/ `Xenova/bert-base-multilingual-uncased-sentiment` (and return top 5 classes). * ```javascript * const classifier = await pipeline('sentiment-analysis', 'Xenova/bert-base-multilingual-uncased-sentiment'); @@ -249,7 +225,7 @@ export class Pipeline extends Callable { * // { label: '2 stars', score: 0.0009423971059732139 } * // ] * ``` - * + * * **Example:** Toxic comment classification w/ `Xenova/toxic-bert` (and return all classes). 
* ```javascript * const classifier = await pipeline('text-classification', 'Xenova/toxic-bert'); @@ -264,56 +240,58 @@ export class Pipeline extends Callable { * // ] * ``` */ -export class TextClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextClassificationPipelineType} */ (Pipeline)) { +export class TextClassificationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => TextClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new TextClassificationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new TextClassificationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** @type {TextClassificationPipelineCallback} */ + async _call(texts, { topk = 1 } = {}) { + // Run tokenization + const model_inputs = this.tokenizer(texts, { + padding: true, + truncation: true, + }); + + // Run model + const outputs = await this.model(model_inputs); + + // TODO: Use softmax tensor function + const function_to_apply = + this.model.config.problem_type === "multi_label_classification" + ? 
(batch) => batch.sigmoid().data + : (batch) => softmax(batch.data); // single_label_classification (default) + + const id2label = this.model.config.id2label; + + const toReturn = []; + for (const batch of outputs.logits) { + const output = function_to_apply(batch); + const scores = getTopItems(output, topk); + + const vals = scores.map((x) => ({ + label: id2label[x[0]], + score: x[1], + })); + if (topk === 1) { + toReturn.push(...vals); + } else { + toReturn.push(vals); + } } - /** @type {TextClassificationPipelineCallback} */ - async _call(texts, { - topk = 1 - } = {}) { - - // Run tokenization - const model_inputs = this.tokenizer(texts, { - padding: true, - truncation: true, - }); - - // Run model - const outputs = await this.model(model_inputs) - - // TODO: Use softmax tensor function - const function_to_apply = - this.model.config.problem_type === 'multi_label_classification' - ? batch => batch.sigmoid().data - : batch => softmax(batch.data); // single_label_classification (default) - - const id2label = this.model.config.id2label; - - const toReturn = []; - for (const batch of outputs.logits) { - const output = function_to_apply(batch); - const scores = getTopItems(output, topk); - - const vals = scores.map(x => ({ - label: id2label[x[0]], - score: x[1], - })); - if (topk === 1) { - toReturn.push(...vals); - } else { - toReturn.push(vals); - } - } - - return Array.isArray(texts) || topk === 1 ? /** @type {TextClassificationOutput} */ (toReturn) : /** @type {TextClassificationOutput[]} */ (toReturn)[0]; - } + return Array.isArray(texts) || topk === 1 + ? /** @type {TextClassificationOutput} */ (toReturn) + : /** @type {TextClassificationOutput[]} */ (toReturn)[0]; + } } /** @@ -325,21 +303,21 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi * @property {number} [start] The index of the start of the corresponding entity in the sentence. * @property {number} [end] The index of the end of the corresponding entity in the sentence. 
* @typedef {TokenClassificationSingle[]} TokenClassificationOutput - * + * * @typedef {Object} TokenClassificationPipelineOptions Parameters specific to token classification pipelines. * @property {string[]} [ignore_labels] A list of labels to ignore. - * + * * @callback TokenClassificationPipelineCallback Classify each token of the text(s) given as inputs. * @param {string|string[]} texts One or several texts (or one list of texts) for token classification. * @param {TokenClassificationPipelineOptions} [options] The options to use for token classification. * @returns {Promise} The result. - * + * * @typedef {TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable} TokenClassificationPipelineType */ /** * Named Entity Recognition pipeline using any `ModelForTokenClassification`. - * + * * **Example:** Perform named entity recognition with `Xenova/bert-base-NER`. * ```javascript * const classifier = await pipeline('token-classification', 'Xenova/bert-base-NER'); @@ -349,7 +327,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi * // { entity: 'B-LOC', score: 0.9994474053382874, index: 9, word: 'London' } * // ] * ``` - * + * * **Example:** Perform named entity recognition with `Xenova/bert-base-NER` (and return all labels). * ```javascript * const classifier = await pipeline('token-classification', 'Xenova/bert-base-NER'); @@ -366,76 +344,80 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi * // ] * ``` */ -export class TokenClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType} */ (Pipeline)) { +export class TokenClassificationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new TokenClassificationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. 
+ */ + constructor(options) { + super(options); + } - /** - * Create a new TokenClassificationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); - } + /** @type {TokenClassificationPipelineCallback} */ + async _call(texts, { ignore_labels = ["O"] } = {}) { + const isBatched = Array.isArray(texts); - /** @type {TokenClassificationPipelineCallback} */ - async _call(texts, { - ignore_labels = ['O'], - } = {}) { + // Run tokenization + const model_inputs = this.tokenizer(isBatched ? texts : [texts], { + padding: true, + truncation: true, + }); - const isBatched = Array.isArray(texts); + // Run model + const outputs = await this.model(model_inputs); - // Run tokenization - const model_inputs = this.tokenizer(isBatched ? texts : [texts], { - padding: true, - truncation: true, - }); + const logits = outputs.logits; + const id2label = this.model.config.id2label; - // Run model - const outputs = await this.model(model_inputs) + const toReturn = []; + for (let i = 0; i < logits.dims[0]; ++i) { + const ids = model_inputs.input_ids[i]; + const batch = logits[i]; - const logits = outputs.logits; - const id2label = this.model.config.id2label; + // List of tokens that aren't ignored + const tokens = []; + for (let j = 0; j < batch.dims[0]; ++j) { + const tokenData = batch[j]; + const topScoreIndex = max(tokenData.data)[1]; - const toReturn = []; - for (let i = 0; i < logits.dims[0]; ++i) { - const ids = model_inputs.input_ids[i]; - const batch = logits[i]; - - // List of tokens that aren't ignored - const tokens = []; - for (let j = 0; j < batch.dims[0]; ++j) { - const tokenData = batch[j]; - const topScoreIndex = max(tokenData.data)[1]; - - const entity = id2label ? id2label[topScoreIndex] : `LABEL_${topScoreIndex}`; - if (ignore_labels.includes(entity)) { - // We predicted a token that should be ignored. So, we skip it. 
- continue; - } - - // TODO add option to keep special tokens? - const word = this.tokenizer.decode([ids[j].item()], { skip_special_tokens: true }); - if (word === '') { - // Was a special token. So, we skip it. - continue; - } - - const scores = softmax(tokenData.data); - - tokens.push({ - entity: entity, - score: scores[topScoreIndex], - index: j, - word: word, - - // TODO: null for now, but will add - start: null, - end: null, - }); - } - toReturn.push(tokens); + const entity = id2label + ? id2label[topScoreIndex] + : `LABEL_${topScoreIndex}`; + if (ignore_labels.includes(entity)) { + // We predicted a token that should be ignored. So, we skip it. + continue; } - return isBatched ? toReturn : toReturn[0]; + + // TODO add option to keep special tokens? + const word = this.tokenizer.decode([ids[j].item()], { + skip_special_tokens: true, + }); + if (word === "") { + // Was a special token. So, we skip it. + continue; + } + + const scores = softmax(tokenData.data); + + tokens.push({ + entity: entity, + score: scores[topScoreIndex], + index: j, + word: word, + + // TODO: null for now, but will add + start: null, + end: null, + }); + } + toReturn.push(tokens); } + return isBatched ? toReturn : toReturn[0]; + } } /** @@ -444,22 +426,22 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP * @property {number} [start] The character start index of the answer (in the tokenized version of the input). * @property {number} [end] The character end index of the answer (in the tokenized version of the input). * @property {string} answer The answer to the question. - * + * * @typedef {Object} QuestionAnsweringPipelineOptions Parameters specific to question answering pipelines. * @property {number} [topk=1] The number of top answer predictions to be returned. - * + * * @callback QuestionAnsweringPipelineCallback Answer the question(s) given as inputs by using the context(s). 
* @param {string|string[]} question One or several question(s) (must be used in conjunction with the `context` argument). * @param {string|string[]} context One or several context(s) associated with the question(s) (must be used in conjunction with the `question` argument). * @param {QuestionAnsweringPipelineOptions} [options] The options to use for question answering. * @returns {Promise} An array or object containing the predicted answers and scores. - * + * * @typedef {TextPipelineConstructorArgs & QuestionAnsweringPipelineCallback & Disposable} QuestionAnsweringPipelineType */ /** * Question Answering pipeline using any `ModelForQuestionAnswering`. - * + * * **Example:** Run question answering with `Xenova/distilbert-base-uncased-distilled-squad`. * ```javascript * const answerer = await pipeline('question-answering', 'Xenova/distilbert-base-uncased-distilled-squad'); @@ -472,70 +454,70 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP * // } * ``` */ -export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineType} */ (Pipeline)) { +export class QuestionAnsweringPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new QuestionAnsweringPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new QuestionAnsweringPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); - } + /** @type {QuestionAnsweringPipelineCallback} */ + async _call(question, context, { topk = 1 } = {}) { + // Run tokenization + const inputs = this.tokenizer(question, { + text_pair: context, + padding: true, + truncation: true, + }); - /** @type {QuestionAnsweringPipelineCallback} */ - async _call(question, context, { - topk = 1 - } = {}) { + const output = await this.model(inputs); - // Run tokenization - const inputs = this.tokenizer(question, { - text_pair: context, - padding: true, - truncation: true, + /** @type {QuestionAnsweringOutput[]} */ + const toReturn = []; + for (let j = 0; j < output.start_logits.dims[0]; ++j) { + const ids = inputs.input_ids[j]; + const sepIndex = ids.indexOf(this.tokenizer.sep_token_id); + + const s1 = Array.from(softmax(output.start_logits[j].data)) + .map((x, i) => [x, i]) + .filter((x) => x[1] > sepIndex); + const e1 = Array.from(softmax(output.end_logits[j].data)) + .map((x, i) => [x, i]) + .filter((x) => x[1] > sepIndex); + + const options = product(s1, e1) + .filter((x) => x[0][1] <= x[1][1]) + .map((x) => [x[0][1], x[1][1], x[0][0] * x[1][0]]) + .sort((a, b) => b[2] - a[2]); + + for (let k = 0; k < Math.min(options.length, topk); ++k) { + const [start, end, score] = options[k]; + + const answer_tokens = [...ids].slice(start, end + 1); + + const answer = this.tokenizer.decode(answer_tokens, { + skip_special_tokens: true, }); - const output = await this.model(inputs); - - /** @type {QuestionAnsweringOutput[]} */ - const toReturn = []; - for (let j = 0; j < output.start_logits.dims[0]; ++j) { - const ids = inputs.input_ids[j]; - const sepIndex = ids.indexOf(this.tokenizer.sep_token_id); - - const s1 = Array.from(softmax(output.start_logits[j].data)) - .map((x, i) => [x, i]) - .filter(x => x[1] > sepIndex); - const e1 = Array.from(softmax(output.end_logits[j].data)) - .map((x, i) => [x, i]) - .filter(x => x[1] > sepIndex); - - const options = product(s1, e1) - .filter(x => 
x[0][1] <= x[1][1]) - .map(x => [x[0][1], x[1][1], x[0][0] * x[1][0]]) - .sort((a, b) => b[2] - a[2]); - - for (let k = 0; k < Math.min(options.length, topk); ++k) { - const [start, end, score] = options[k]; - - const answer_tokens = [...ids].slice(start, end + 1) - - const answer = this.tokenizer.decode(answer_tokens, { - skip_special_tokens: true, - }); - - // TODO add start and end? - // NOTE: HF returns character index - toReturn.push({ - answer, score - }); - } - } - - // Mimic HF's return type based on topk - return (topk === 1) ? toReturn[0] : toReturn; + // TODO add start and end? + // NOTE: HF returns character index + toReturn.push({ + answer, + score, + }); + } } -} + // Mimic HF's return type based on topk + return topk === 1 ? toReturn[0] : toReturn; + } +} /** * @typedef {Object} FillMaskSingle @@ -544,10 +526,10 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * @property {number} token The predicted token id (to replace the masked one). * @property {string} token_str The predicted token (to replace the masked one). * @typedef {FillMaskSingle[]} FillMaskOutput - * + * * @typedef {Object} FillMaskPipelineOptions Parameters specific to fill mask pipelines. * @property {number} [topk=5] When passed, overrides the number of predictions to return. - * + * * @callback FillMaskPipelineCallback Fill the masked token in the text(s) given as inputs. * @param {string|string[]} texts One or several texts (or one list of prompts) with masked tokens. * @param {FillMaskPipelineOptions} [options] The options to use for masked language modelling. @@ -555,13 +537,13 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * and the sequence with the predicted token filled in, or an array of such arrays (one for each input text). * If only one input text is given, the output will be an array of objects. * @throws {Error} When the mask token is not found in the input text. 
- * + * * @typedef {TextPipelineConstructorArgs & FillMaskPipelineCallback & Disposable} FillMaskPipelineType */ /** * Masked language modeling prediction pipeline using any `ModelWithLMHead`. - * + * * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-uncased`. * ```javascript * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased'); @@ -574,7 +556,7 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * // { token_str: 'life', score: 0.01859794743359089, token: 1297, sequence: 'The goal of life is life.' } * // ] * ``` - * + * * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-cased` (and return top result). * ```javascript * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased'); @@ -582,77 +564,82 @@ export class QuestionAnsweringPipeline extends (/** @type {new (options: TextPip * // [{ token_str: 'spiral', score: 0.6299987435340881, token: 14061, sequence: 'The Milky Way is a spiral galaxy.' }] * ``` */ -export class FillMaskPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineType} */ (Pipeline)) { +export class FillMaskPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new FillMaskPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new FillMaskPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); - } - - /** @type {FillMaskPipelineCallback} */ - async _call(texts, { - topk = 5 - } = {}) { - - // Run tokenization - const model_inputs = this.tokenizer(texts, { - padding: true, - truncation: true, - }); - - // Run model - const outputs = await this.model(model_inputs) - - const toReturn = []; - - for (let i = 0; i < model_inputs.input_ids.dims[0]; ++i) { - const ids = model_inputs.input_ids[i]; - const mask_token_index = ids.indexOf(this.tokenizer.mask_token_id) - - if (mask_token_index === -1) { - throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`) - } - const logits = outputs.logits[i]; - const itemLogits = logits[mask_token_index]; - - const scores = getTopItems(softmax(itemLogits.data), topk); - - toReturn.push(scores.map(x => { - const sequence = [...ids]; - sequence[mask_token_index] = x[0]; - - return { - score: x[1], - token: x[0], - token_str: this.tokenizer.model.vocab[x[0]], - sequence: this.tokenizer.decode(sequence, { skip_special_tokens: true }), - } - })); - } - return Array.isArray(texts) ? 
toReturn : toReturn[0]; + /** @type {FillMaskPipelineCallback} */ + async _call(texts, { topk = 5 } = {}) { + // Run tokenization + const model_inputs = this.tokenizer(texts, { + padding: true, + truncation: true, + }); + + // Run model + const outputs = await this.model(model_inputs); + + const toReturn = []; + + for (let i = 0; i < model_inputs.input_ids.dims[0]; ++i) { + const ids = model_inputs.input_ids[i]; + const mask_token_index = ids.indexOf(this.tokenizer.mask_token_id); + + if (mask_token_index === -1) { + throw Error( + `Mask token (${this.tokenizer.mask_token}) not found in text.`, + ); + } + const logits = outputs.logits[i]; + const itemLogits = logits[mask_token_index]; + + const scores = getTopItems(softmax(itemLogits.data), topk); + + toReturn.push( + scores.map((x) => { + const sequence = [...ids]; + sequence[mask_token_index] = x[0]; + + return { + score: x[1], + token: x[0], + token_str: this.tokenizer.model.vocab[x[0]], + sequence: this.tokenizer.decode(sequence, { + skip_special_tokens: true, + }), + }; + }), + ); } + return Array.isArray(texts) ? toReturn : toReturn[0]; + } } - /** * @typedef {Object} Text2TextGenerationSingle * @property {string} generated_text The generated text. * @typedef {Text2TextGenerationSingle[]} Text2TextGenerationOutput - * + * * @callback Text2TextGenerationPipelineCallback Generate the output text(s) using text(s) given as inputs. * @param {string|string[]} texts Input text for the encoder. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} - * + * * @typedef {TextPipelineConstructorArgs & Text2TextGenerationPipelineCallback & Disposable} Text2TextGenerationPipelineType */ /** * Text2TextGenerationPipeline class for generating text using a model that performs text-to-text generation tasks. - * + * * **Example:** Text-to-text generation w/ `Xenova/LaMini-Flan-T5-783M`. 
* ```javascript * const generator = await pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M'); @@ -662,82 +649,95 @@ export class FillMaskPipeline extends (/** @type {new (options: TextPipelineCons * // [{ generated_text: "To become more healthy, you can: 1. Eat a balanced diet with plenty of fruits, vegetables, whole grains, lean proteins, and healthy fats. 2. Stay hydrated by drinking plenty of water. 3. Get enough sleep and manage stress levels. 4. Avoid smoking and excessive alcohol consumption. 5. Regularly exercise and maintain a healthy weight. 6. Practice good hygiene and sanitation. 7. Seek medical attention if you experience any health issues." }] * ``` */ -export class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineType} */ (Pipeline)) { - /** @type {'generated_text'} */ - _key = 'generated_text'; +export class Text2TextGenerationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineType} */ ( + Pipeline + ) +{ + /** @type {'generated_text'} */ + _key = "generated_text"; - /** - * Create a new Text2TextGenerationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** + * Create a new Text2TextGenerationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. 
+ */ + constructor(options) { + super(options); + } + + /** @type {Text2TextGenerationPipelineCallback} */ + async _call(texts, generate_kwargs = {}) { + if (!Array.isArray(texts)) { + texts = [texts]; } - /** @type {Text2TextGenerationPipelineCallback} */ - async _call(texts, generate_kwargs = {}) { - if (!Array.isArray(texts)) { - texts = [texts]; - } - - - // Add global prefix, if present - if (this.model.config.prefix) { - texts = texts.map(x => this.model.config.prefix + x) - } - - // Handle task specific params: - const task_specific_params = this.model.config.task_specific_params - if (task_specific_params && task_specific_params[this.task]) { - // Add prefixes, if present - if (task_specific_params[this.task].prefix) { - texts = texts.map(x => task_specific_params[this.task].prefix + x) - } - - // TODO update generation config - } - - const tokenizer = this.tokenizer; - const tokenizer_options = { - padding: true, - truncation: true, - } - let input_ids; - if (this instanceof TranslationPipeline && '_build_translation_inputs' in tokenizer) { - // TODO: move to Translation pipeline? 
- // Currently put here to avoid code duplication - // @ts-ignore - input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs).input_ids; - - } else { - input_ids = tokenizer(texts, tokenizer_options).input_ids; - } - - const outputTokenIds = await this.model.generate(input_ids, generate_kwargs); - - return tokenizer.batch_decode(outputTokenIds, { - skip_special_tokens: true, - }).map(text => ({ [this._key]: text })); + // Add global prefix, if present + if (this.model.config.prefix) { + texts = texts.map((x) => this.model.config.prefix + x); } + + // Handle task specific params: + const task_specific_params = this.model.config.task_specific_params; + if (task_specific_params && task_specific_params[this.task]) { + // Add prefixes, if present + if (task_specific_params[this.task].prefix) { + texts = texts.map((x) => task_specific_params[this.task].prefix + x); + } + + // TODO update generation config + } + + const tokenizer = this.tokenizer; + const tokenizer_options = { + padding: true, + truncation: true, + }; + let input_ids; + if ( + this instanceof TranslationPipeline && + "_build_translation_inputs" in tokenizer + ) { + // TODO: move to Translation pipeline? + // Currently put here to avoid code duplication + // @ts-ignore + input_ids = tokenizer._build_translation_inputs( + texts, + tokenizer_options, + generate_kwargs, + ).input_ids; + } else { + input_ids = tokenizer(texts, tokenizer_options).input_ids; + } + + const outputTokenIds = await this.model.generate( + input_ids, + generate_kwargs, + ); + + return tokenizer + .batch_decode(outputTokenIds, { + skip_special_tokens: true, + }) + .map((text) => ({ [this._key]: text })); + } } - /** * @typedef {Object} SummarizationSingle * @property {string} summary_text The summary text. * @typedef {SummarizationSingle[]} SummarizationOutput - * + * * @callback SummarizationPipelineCallback Summarize the text(s) given as inputs. 
* @param {string|string[]} texts One or several articles (or one list of articles) to summarize. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} - * + * * @typedef {TextPipelineConstructorArgs & SummarizationPipelineCallback & Disposable} SummarizationPipelineType */ /** * A pipeline for summarization tasks, inheriting from Text2TextGenerationPipeline. - * + * * **Example:** Summarization w/ `Xenova/distilbart-cnn-6-6`. * ```javascript * const generator = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6'); @@ -755,41 +755,44 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP * // [{ summary_text: ' The Eiffel Tower is about the same height as an 81-storey building and the tallest structure in Paris. It is the second tallest free-standing structure in France after the Millau Viaduct.' }] * ``` */ -export class SummarizationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => SummarizationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { - /** @type {'summary_text'} */ - _key = 'summary_text'; +export class SummarizationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => SummarizationPipelineType} */ ( + /** @type {any} */ (Text2TextGenerationPipeline) + ) +{ + /** @type {'summary_text'} */ + _key = "summary_text"; - /** - * Create a new SummarizationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); - } + /** + * Create a new SummarizationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } } - /** * @typedef {Object} TranslationSingle * @property {string} translation_text The translated text. 
* @typedef {TranslationSingle[]} TranslationOutput - * + * * @callback TranslationPipelineCallback Translate the text(s) given as inputs. * @param {string|string[]} texts Texts to be translated. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} - * + * * @typedef {TextPipelineConstructorArgs & TranslationPipelineCallback & Disposable} TranslationPipelineType */ /** * Translates text from one language to another. - * + * * **Example:** Multilingual translation w/ `Xenova/nllb-200-distilled-600M`. - * + * * See [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) * for the full list of languages and their corresponding codes. - * + * * ```javascript * const translator = await pipeline('translation', 'Xenova/nllb-200-distilled-600M'); * const output = await translator('เคœเฅ€เคตเคจ เคเค• เคšเฅ‰เค•เคฒเฅ‡เคŸ เคฌเฅ‰เค•เฅเคธ เค•เฅ€ เคคเคฐเคน เคนเฅˆเฅค', { @@ -798,12 +801,12 @@ export class SummarizationPipeline extends (/** @type {new (options: TextPipelin * }); * // [{ translation_text: 'La vie est comme une boรฎte ร  chocolat.' }] * ``` - * + * * **Example:** Multilingual translation w/ `Xenova/m2m100_418M`. - * + * * See [here](https://huggingface.co/facebook/m2m100_418M#languages-covered) * for the full list of languages and their corresponding codes. - * + * * ```javascript * const translator = await pipeline('translation', 'Xenova/m2m100_418M'); * const output = await translator('็”Ÿๆดปๅฐฑๅƒไธ€็›’ๅทงๅ…‹ๅŠ›ใ€‚', { @@ -812,12 +815,12 @@ export class SummarizationPipeline extends (/** @type {new (options: TextPipelin * }); * // [{ translation_text: 'Life is like a box of chocolate.' }] * ``` - * + * * **Example:** Multilingual translation w/ `Xenova/mbart-large-50-many-to-many-mmt`. 
- * + * * See [here](https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered) * for the full list of languages and their corresponding codes. - * + * * ```javascript * const translator = await pipeline('translation', 'Xenova/mbart-large-50-many-to-many-mmt'); * const output = await translator('เคธเค‚เคฏเฅเค•เฅเคค เคฐเคพเคทเฅเคŸเฅเคฐ เค•เฅ‡ เคชเฅเคฐเคฎเฅเค– เค•เคพ เค•เคนเคจเคพ เคนเฅˆ เค•เคฟ เคธเฅ€เคฐเคฟเคฏเคพ เคฎเฅ‡เค‚ เค•เฅ‹เคˆ เคธเฅˆเคจเฅเคฏ เคธเคฎเคพเคงเคพเคจ เคจเคนเฅ€เค‚ เคนเฅˆ', { @@ -827,34 +830,37 @@ export class SummarizationPipeline extends (/** @type {new (options: TextPipelin * // [{ translation_text: 'Le chef des Nations affirme qu 'il n 'y a military solution in Syria.' }] * ``` */ -export class TranslationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TranslationPipelineType} */ (/** @type {any} */ (Text2TextGenerationPipeline))) { - /** @type {'translation_text'} */ - _key = 'translation_text'; +export class TranslationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => TranslationPipelineType} */ ( + /** @type {any} */ (Text2TextGenerationPipeline) + ) +{ + /** @type {'translation_text'} */ + _key = "translation_text"; - /** - * Create a new TranslationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); - } + /** + * Create a new TranslationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } } - /** * @typedef {Object} TextGenerationSingle * @property {string} generated_text The generated text. * @typedef {TextGenerationSingle[]} TextGenerationOutput - * + * * @typedef {Object} TextGenerationSpecificParams Parameters specific to text-generation pipelines. * @property {boolean} [add_special_tokens] Whether or not to add special tokens when tokenizing the sequences. 
* @typedef {import('./utils/generation.js').GenerationConfigType & TextGenerationSpecificParams} TextGenerationConfig - * + * * @callback TextGenerationPipelineCallback Complete the prompt(s) given as inputs. * @param {string|string[]} texts One or several prompts (or one list of prompts) to complete. * @param {TextGenerationConfig} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An array or object containing the generated texts. - * + * * @typedef {TextPipelineConstructorArgs & TextGenerationPipelineCallback & Disposable} TextGenerationPipelineType */ @@ -862,7 +868,7 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * Language generation pipeline using any `ModelWithLMHead` or `ModelForCausalLM`. * This pipeline predicts the words that will follow a specified text prompt. * NOTE: For the full list of generation parameters, see [`GenerationConfig`](./utils/generation#module_utils/generation.GenerationConfig). - * + * * **Example:** Text generation with `Xenova/distilgpt2` (default settings). * ```javascript * const generator = await pipeline('text-generation', 'Xenova/distilgpt2'); @@ -870,7 +876,7 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * const output = await generator(text); * // [{ generated_text: "I enjoy walking with my cute dog, and I love to play with the other dogs." }] * ``` - * + * * **Example:** Text generation with `Xenova/distilgpt2` (custom settings). * ```javascript * const generator = await pipeline('text-generation', 'Xenova/distilgpt2'); @@ -889,7 +895,7 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * // "generated_text": "Once upon a time, there was an abundance of information about the most important and influential" * // }] * ``` - * + * * **Example:** Run code generation with `Xenova/codegen-350M-mono`. 
* ```javascript * const generator = await pipeline('text-generation', 'Xenova/codegen-350M-mono'); @@ -908,53 +914,60 @@ export class TranslationPipeline extends (/** @type {new (options: TextPipelineC * // }] * ``` */ -export class TextGenerationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => TextGenerationPipelineType} */ (Pipeline)) { +export class TextGenerationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => TextGenerationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new TextGenerationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new TextGenerationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** @type {TextGenerationPipelineCallback} */ + async _call(texts, generate_kwargs = {}) { + const isBatched = Array.isArray(texts); + if (!isBatched) { + texts = [/** @type {string}*/ (texts)]; } - /** @type {TextGenerationPipelineCallback} */ - async _call(texts, generate_kwargs = {}) { + // By default, do not add special tokens + const add_special_tokens = generate_kwargs.add_special_tokens ?? false; - const isBatched = Array.isArray(texts); - if (!isBatched) { - texts = [/** @type {string}*/ (texts)]; - } + this.tokenizer.padding_side = "left"; + const { input_ids, attention_mask } = this.tokenizer(texts, { + add_special_tokens, + padding: true, + truncation: true, + }); - // By default, do not add special tokens - const add_special_tokens = generate_kwargs.add_special_tokens ?? 
false; + const outputTokenIds = await this.model.generate( + input_ids, + generate_kwargs, + null, + { + inputs_attention_mask: attention_mask, + }, + ); - this.tokenizer.padding_side = 'left'; - const { input_ids, attention_mask } = this.tokenizer(texts, { - add_special_tokens, - padding: true, - truncation: true, - }); + const decoded = this.tokenizer.batch_decode(outputTokenIds, { + skip_special_tokens: true, + }); - const outputTokenIds = await this.model.generate(input_ids, generate_kwargs, null, { - inputs_attention_mask: attention_mask - }); + /** @type {TextGenerationOutput[]} */ + const toReturn = Array.from({ length: texts.length }, (_) => []); + for (let i = 0; i < decoded.length; ++i) { + const textIndex = Math.floor((i / outputTokenIds.length) * texts.length); - const decoded = this.tokenizer.batch_decode(outputTokenIds, { - skip_special_tokens: true, - }); - - /** @type {TextGenerationOutput[]} */ - const toReturn = Array.from({ length: texts.length }, _ => []); - for (let i = 0; i < decoded.length; ++i) { - const textIndex = Math.floor(i / outputTokenIds.length * texts.length); - - toReturn[textIndex].push({ - generated_text: decoded[i] - }); - } - return (!isBatched && toReturn.length === 1) ? toReturn[0] : toReturn; + toReturn[textIndex].push({ + generated_text: decoded[i], + }); } + return !isBatched && toReturn.length === 1 ? toReturn[0] : toReturn; + } } /** @@ -962,7 +975,7 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * @property {string} sequence The sequence for which this is the output. * @property {string[]} labels The labels sorted by order of likelihood. * @property {number[]} scores The probabilities for each of the labels. - * + * * @typedef {Object} ZeroShotClassificationPipelineOptions Parameters specific to zero-shot classification pipelines. * @property {string} [hypothesis_template="This example is {}."] The template used to turn each * candidate label into an NLI-style hypothesis. 
The candidate label will replace the {} placeholder. @@ -970,14 +983,14 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * If `false`, the scores are normalized such that the sum of the label likelihoods for each sequence * is 1. If `true`, the labels are considered independent and probabilities are normalized for each * candidate by doing a softmax of the entailment score vs. the contradiction score. - * + * * @callback ZeroShotClassificationPipelineCallback Classify the sequence(s) given as inputs. * @param {string|string[]} texts The sequence(s) to classify, will be truncated if the model input is too large. * @param {string|string[]} candidate_labels The set of possible class labels to classify each sequence into. * Can be a single label, a string of comma-separated labels, or a list of labels. * @param {ZeroShotClassificationPipelineOptions} [options] The options to use for zero-shot classification. * @returns {Promise} An array or object containing the predicted labels and scores. - * + * * @typedef {TextPipelineConstructorArgs & ZeroShotClassificationPipelineCallback & Disposable} ZeroShotClassificationPipelineType */ @@ -986,7 +999,7 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * trained on NLI (natural language inference) tasks. Equivalent of `text-classification` * pipelines, but these models don't require a hardcoded number of potential classes, they * can be chosen at runtime. It usually means it's slower but it is **much** more flexible. - * + * * **Example:** Zero shot classification with `Xenova/mobilebert-uncased-mnli`. 
* ```javascript * const classifier = await pipeline('zero-shot-classification', 'Xenova/mobilebert-uncased-mnli'); @@ -999,7 +1012,7 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * // scores: [ 0.5562091040482018, 0.1843621307860853, 0.13942646639336376, 0.12000229877234923 ] * // } * ``` - * + * * **Example:** Zero shot classification with `Xenova/nli-deberta-v3-xsmall` (multi-label). * ```javascript * const classifier = await pipeline('zero-shot-classification', 'Xenova/nli-deberta-v3-xsmall'); @@ -1013,118 +1026,127 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli * // } * ``` */ -export class ZeroShotClassificationPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineType} */ (Pipeline)) { - /** - * Create a new ZeroShotClassificationPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class ZeroShotClassificationPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ZeroShotClassificationPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); - // Use model config to get label2id mapping - this.label2id = Object.fromEntries( - Object.entries((/** @type {any} */(this).model).config.label2id).map( - ([k, v]) => [k.toLowerCase(), v] - ) - ); + // Use model config to get label2id mapping + this.label2id = Object.fromEntries( + Object.entries(/** @type {any} */ (this).model.config.label2id).map( + ([k, v]) => [k.toLowerCase(), v], + ), + ); - this.entailment_id = this.label2id['entailment']; - if (this.entailment_id === undefined) { - console.warn("Could not find 'entailment' in label2id mapping. 
Using 2 as entailment_id."); - this.entailment_id = 2; - } - - this.contradiction_id = this.label2id['contradiction'] ?? this.label2id['not_entailment']; - if (this.contradiction_id === undefined) { - console.warn("Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."); - this.contradiction_id = 0; - } + this.entailment_id = this.label2id["entailment"]; + if (this.entailment_id === undefined) { + console.warn( + "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id.", + ); + this.entailment_id = 2; } - /** @type {ZeroShotClassificationPipelineCallback} */ - async _call(texts, candidate_labels, { - hypothesis_template = "This example is {}.", - multi_label = false, - } = {}) { - - const isBatched = Array.isArray(texts); - if (!isBatched) { - texts = [/** @type {string} */ (texts)]; - } - if (!Array.isArray(candidate_labels)) { - candidate_labels = [candidate_labels]; - } - - // Insert labels into hypothesis template - const hypotheses = candidate_labels.map( - x => hypothesis_template.replace('{}', x) - ); - - // How to perform the softmax over the logits: - // - true: softmax over the entailment vs. contradiction dim for each label independently - // - false: softmax the "entailment" logits over all candidate labels - const softmaxEach = multi_label || candidate_labels.length === 1; - - /** @type {ZeroShotClassificationOutput[]} */ - const toReturn = []; - for (const premise of texts) { - const entails_logits = []; - - for (const hypothesis of hypotheses) { - const inputs = this.tokenizer(premise, { - text_pair: hypothesis, - padding: true, - truncation: true, - }) - const outputs = await this.model(inputs) - - if (softmaxEach) { - entails_logits.push([ - outputs.logits.data[this.contradiction_id], - outputs.logits.data[this.entailment_id] - ]) - } else { - entails_logits.push(outputs.logits.data[this.entailment_id]) - } - } - - /** @type {number[]} */ - const scores = softmaxEach - ? 
entails_logits.map(x => softmax(x)[1]) - : softmax(entails_logits); - - // Sort by scores (desc) and return scores with indices - const scores_sorted = scores - .map((x, i) => [x, i]) - .sort((a, b) => (b[0] - a[0])); - - toReturn.push({ - sequence: premise, - labels: scores_sorted.map(x => candidate_labels[x[1]]), - scores: scores_sorted.map(x => x[0]), - }); - } - return isBatched ? toReturn : toReturn[0]; + this.contradiction_id = + this.label2id["contradiction"] ?? this.label2id["not_entailment"]; + if (this.contradiction_id === undefined) { + console.warn( + "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id.", + ); + this.contradiction_id = 0; } + } + + /** @type {ZeroShotClassificationPipelineCallback} */ + async _call( + texts, + candidate_labels, + { hypothesis_template = "This example is {}.", multi_label = false } = {}, + ) { + const isBatched = Array.isArray(texts); + if (!isBatched) { + texts = [/** @type {string} */ (texts)]; + } + if (!Array.isArray(candidate_labels)) { + candidate_labels = [candidate_labels]; + } + + // Insert labels into hypothesis template + const hypotheses = candidate_labels.map((x) => + hypothesis_template.replace("{}", x), + ); + + // How to perform the softmax over the logits: + // - true: softmax over the entailment vs. 
contradiction dim for each label independently + // - false: softmax the "entailment" logits over all candidate labels + const softmaxEach = multi_label || candidate_labels.length === 1; + + /** @type {ZeroShotClassificationOutput[]} */ + const toReturn = []; + for (const premise of texts) { + const entails_logits = []; + + for (const hypothesis of hypotheses) { + const inputs = this.tokenizer(premise, { + text_pair: hypothesis, + padding: true, + truncation: true, + }); + const outputs = await this.model(inputs); + + if (softmaxEach) { + entails_logits.push([ + outputs.logits.data[this.contradiction_id], + outputs.logits.data[this.entailment_id], + ]); + } else { + entails_logits.push(outputs.logits.data[this.entailment_id]); + } + } + + /** @type {number[]} */ + const scores = softmaxEach + ? entails_logits.map((x) => softmax(x)[1]) + : softmax(entails_logits); + + // Sort by scores (desc) and return scores with indices + const scores_sorted = scores + .map((x, i) => [x, i]) + .sort((a, b) => b[0] - a[0]); + + toReturn.push({ + sequence: premise, + labels: scores_sorted.map((x) => candidate_labels[x[1]]), + scores: scores_sorted.map((x) => x[0]), + }); + } + return isBatched ? toReturn : toReturn[0]; + } } /** * @typedef {Object} FeatureExtractionPipelineOptions Parameters specific to feature extraction pipelines. * @property {'none'|'mean'|'cls'} [pooling="none"] The pooling method to use. * @property {boolean} [normalize=false] Whether or not to normalize the embeddings in the last dimension. - * + * * @callback FeatureExtractionPipelineCallback Extract the features of the input(s). * @param {string|string[]} texts One or several texts (or one list of texts) to get the features of. * @param {FeatureExtractionPipelineOptions} [options] The options to use for feature extraction. * @returns {Promise} The features computed by the model. 
- * + * * @typedef {TextPipelineConstructorArgs & FeatureExtractionPipelineCallback & Disposable} FeatureExtractionPipelineType */ /** * Feature extraction pipeline using no model head. This pipeline extracts the hidden * states from the base transformer, which can be used as features in downstream tasks. - * + * * **Example:** Run feature extraction with `bert-base-uncased` (without pooling/normalization). * ```javascript * const extractor = await pipeline('feature-extraction', 'Xenova/bert-base-uncased', { revision: 'default' }); @@ -1135,7 +1157,7 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te * // dims: [1, 8, 768] * // } * ``` - * + * * **Example:** Run feature extraction with `bert-base-uncased` (with pooling/normalization). * ```javascript * const extractor = await pipeline('feature-extraction', 'Xenova/bert-base-uncased', { revision: 'default' }); @@ -1146,7 +1168,7 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te * // dims: [1, 768] * // } * ``` - * + * * **Example:** Calculating embeddings with `sentence-transformers` models. * ```javascript * const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); @@ -1158,53 +1180,56 @@ export class ZeroShotClassificationPipeline extends (/** @type {new (options: Te * // } * ``` */ -export class FeatureExtractionPipeline extends (/** @type {new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineType} */ (Pipeline)) { - /** - * Create a new FeatureExtractionPipeline. - * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class FeatureExtractionPipeline + extends /** @type {new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new FeatureExtractionPipeline. + * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline. 
+ */ + constructor(options) { + super(options); + } + + /** @type {FeatureExtractionPipelineCallback} */ + async _call( + texts, + { pooling = /** @type {'none'} */ ("none"), normalize = false } = {}, + ) { + // Run tokenization + const model_inputs = this.tokenizer(texts, { + padding: true, + truncation: true, + }); + + // Run model + const outputs = await this.model(model_inputs); + + // TODO: Provide warning to the user that they might be using model which was not exported + // specifically for feature extraction + // console.log(this.model.config) + // console.log(outputs) + + /** @type {Tensor} */ + let result = outputs.last_hidden_state ?? outputs.logits; + if (pooling === "none") { + // Skip pooling + } else if (pooling === "mean") { + result = mean_pooling(result, model_inputs.attention_mask); + } else if (pooling === "cls") { + result = result.slice(null, 0); + } else { + throw Error(`Pooling method '${pooling}' not supported.`); } - /** @type {FeatureExtractionPipelineCallback} */ - async _call(texts, { - pooling = /** @type {'none'} */('none'), - normalize = false, - } = {}) { - - // Run tokenization - const model_inputs = this.tokenizer(texts, { - padding: true, - truncation: true, - }); - - // Run model - const outputs = await this.model(model_inputs) - - // TODO: Provide warning to the user that they might be using model which was not exported - // specifically for feature extraction - // console.log(this.model.config) - // console.log(outputs) - - /** @type {Tensor} */ - let result = outputs.last_hidden_state ?? 
outputs.logits; - if (pooling === 'none') { - // Skip pooling - } else if (pooling === 'mean') { - result = mean_pooling(result, model_inputs.attention_mask); - } else if (pooling === 'cls') { - result = result.slice(null, 0); - } else { - throw Error(`Pooling method '${pooling}' not supported.`); - } - - if (normalize) { - result = result.normalize(2, -1); - } - - return result; + if (normalize) { + result = result.normalize(2, -1); } + + return result; + } } // TODO @@ -1216,12 +1241,12 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * @property {string} label The label predicted. * @property {number} score The corresponding probability. * @typedef {AudioClassificationSingle[]} AudioClassificationOutput - * + * * @typedef {Object} AudioClassificationPipelineOptions Parameters specific to audio classification pipelines. * @property {number} [topk=null] The number of top labels that will be returned by the pipeline. * If the provided number is `null` or higher than the number of labels available in the model configuration, * it will default to the number of labels. - * + * * @callback AudioClassificationPipelineCallback Classify the sequence(s) given as inputs. * @param {AudioPipelineInputs} audio The input audio file(s) to be classified. The input is either: * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate @@ -1230,14 +1255,14 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done). * @param {AudioClassificationPipelineOptions} [options] The options to use for audio classification. * @returns {Promise} An array or object containing the predicted labels and scores. 
- * + * * @typedef {AudioPipelineConstructorArgs & AudioClassificationPipelineCallback & Disposable} AudioClassificationPipelineType */ /** * Audio classification pipeline using any `AutoModelForAudioClassification`. * This pipeline predicts the class of a raw waveform or an audio file. - * + * * **Example:** Perform audio classification with `Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech`. * ```javascript * const classifier = await pipeline('audio-classification', 'Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech'); @@ -1248,7 +1273,7 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * // { label: 'female', score: 0.001845747814513743 } * // ] * ``` - * + * * **Example:** Perform audio classification with `Xenova/ast-finetuned-audioset-10-10-0.4593` and return top 4 results. * ```javascript * const classifier = await pipeline('audio-classification', 'Xenova/ast-finetuned-audioset-10-10-0.4593'); @@ -1262,61 +1287,63 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip * // ] * ``` */ -export class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineType} */ (Pipeline)) { +export class AudioClassificationPipeline + extends /** @type {new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new AudioClassificationPipeline. + * @param {AudioPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new AudioClassificationPipeline. - * @param {AudioPipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); - } - - /** @type {AudioClassificationPipelineCallback} */ - async _call(audio, { - topk = null - } = {}) { - - const single = !Array.isArray(audio); - - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(audio, sampling_rate); - - const id2label = this.model.config.id2label; - - const toReturn = []; - for (const aud of preparedAudios) { - const inputs = await this.processor(aud); - const output = await this.model(inputs); - const logits = output.logits[0]; - - const scores = getTopItems(softmax(logits.data), topk); - - const vals = scores.map(x => ({ - label: /** @type {string} */ (id2label[x[0]]), - score: /** @type {number} */ (x[1]), - })); - - if (topk === 1) { - toReturn.push(...vals); - } else { - toReturn.push(vals); - } - } - return !single || topk === 1 ? /** @type {AudioClassificationOutput} */ (toReturn) : /** @type {AudioClassificationOutput[]} */ (toReturn)[0]; + /** @type {AudioClassificationPipelineCallback} */ + async _call(audio, { topk = null } = {}) { + const single = !Array.isArray(audio); + + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(audio, sampling_rate); + + const id2label = this.model.config.id2label; + + const toReturn = []; + for (const aud of preparedAudios) { + const inputs = await this.processor(aud); + const output = await this.model(inputs); + const logits = output.logits[0]; + + const scores = getTopItems(softmax(logits.data), topk); + + const vals = scores.map((x) => ({ + label: /** @type {string} */ (id2label[x[0]]), + score: /** @type {number} */ (x[1]), + })); + + if (topk === 1) { + toReturn.push(...vals); + } else { + toReturn.push(vals); + } } + return !single || topk === 1 + ? 
/** @type {AudioClassificationOutput} */ (toReturn) + : /** @type {AudioClassificationOutput[]} */ (toReturn)[0]; + } } /** * @typedef {Object} ZeroShotAudioClassificationOutput * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`. * @property {number} score The score attributed by the model for that label (between 0 and 1). - * + * * @typedef {Object} ZeroShotAudioClassificationPipelineOptions Parameters specific to zero-shot audio classification pipelines. * @property {string} [hypothesis_template="This is a sound of {}."] The sentence used in conjunction with `candidate_labels` * to attempt the audio classification by replacing the placeholder with the candidate_labels. * Then likelihood is estimated by using `logits_per_audio`. - * + * * @callback ZeroShotAudioClassificationPipelineCallback Classify the sequence(s) given as inputs. * @param {AudioPipelineInputs} audio The input audio file(s) to be classified. The input is either: * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate @@ -1326,14 +1353,14 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio * @param {string[]} candidate_labels The candidate labels for this audio. * @param {ZeroShotAudioClassificationPipelineOptions} [options] The options to use for zero-shot audio classification. * @returns {Promise} An array of objects containing the predicted labels and scores. - * + * * @typedef {TextAudioPipelineConstructorArgs & ZeroShotAudioClassificationPipelineCallback & Disposable} ZeroShotAudioClassificationPipelineType */ /** * Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you * provide an audio and a set of `candidate_labels`. - * + * * **Example**: Perform zero-shot audio classification with `Xenova/clap-htsat-unfused`. 
* ```javascript * const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused'); @@ -1346,57 +1373,63 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio * // ] * ``` */ -export class ZeroShotAudioClassificationPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineType} */ (Pipeline)) { +export class ZeroShotAudioClassificationPipeline + extends /** @type {new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ZeroShotAudioClassificationPipeline. + * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new ZeroShotAudioClassificationPipeline. - * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** @type {ZeroShotAudioClassificationPipelineCallback} */ + async _call( + audio, + candidate_labels, + { hypothesis_template = "This is a sound of {}." } = {}, + ) { + const single = !Array.isArray(audio); + if (single) { + audio = [/** @type {AudioInput} */ (audio)]; } - /** @type {ZeroShotAudioClassificationPipelineCallback} */ - async _call(audio, candidate_labels, { - hypothesis_template = "This is a sound of {}." 
- } = {}) { + // Insert label into hypothesis template + const texts = candidate_labels.map((x) => + hypothesis_template.replace("{}", x), + ); - const single = !Array.isArray(audio); - if (single) { - audio = [/** @type {AudioInput} */ (audio)]; - } + // Run tokenization + const text_inputs = this.tokenizer(texts, { + padding: true, + truncation: true, + }); - // Insert label into hypothesis template - const texts = candidate_labels.map( - x => hypothesis_template.replace('{}', x) - ); + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(audio, sampling_rate); - // Run tokenization - const text_inputs = this.tokenizer(texts, { - padding: true, - truncation: true, - }); + const toReturn = []; + for (const aud of preparedAudios) { + const audio_inputs = await this.processor(aud); - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(audio, sampling_rate); + // Run model with both text and audio inputs + const output = await this.model({ ...text_inputs, ...audio_inputs }); - const toReturn = []; - for (const aud of preparedAudios) { - const audio_inputs = await this.processor(aud); + // Compute softmax per audio + const probs = softmax(output.logits_per_audio.data); - // Run model with both text and audio inputs - const output = await this.model({ ...text_inputs, ...audio_inputs }); - - // Compute softmax per audio - const probs = softmax(output.logits_per_audio.data); - - toReturn.push([...probs].map((x, i) => ({ - score: x, - label: candidate_labels[i] - }))); - } - return single ? toReturn[0] : toReturn; + toReturn.push( + [...probs].map((x, i) => ({ + score: x, + label: candidate_labels[i], + })), + ); } + return single ? toReturn[0] : toReturn; + } } /** @@ -1416,7 +1449,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * @property {string} text The recognized text. 
* @property {Chunk[]} [chunks] When using `return_timestamps`, the `chunks` will become a list * containing all the various text chunks identified by the model. - * + * * @typedef {Object} AutomaticSpeechRecognitionSpecificParams Parameters specific to automatic-speech-recognition pipelines. * @property {boolean|'word'} [kwargs.return_timestamps] Whether to return timestamps or not. Default is `false`. * @property {number} [kwargs.chunk_length_s] The length of audio chunks to process in seconds. Default is 0 (no chunking). @@ -1429,7 +1462,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * that will be forced before sampling. For example, [[1, 123]] means the second generated token will always be a token of index 123. * @property {number} [num_frames] The number of frames in the input audio. * @typedef {import('./utils/generation.js').GenerationConfigType & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig - * + * * @callback AutomaticSpeechRecognitionPipelineCallback Transcribe the audio sequence(s) given as inputs to text. * @param {AudioPipelineInputs} audio The input audio file(s) to be transcribed. The input is either: * - `string` or `URL` that is the filename/URL of the audio file, the file will be read at the processor's sampling rate @@ -1438,7 +1471,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * - `Float32Array` or `Float64Array` of shape `(n, )`, representing the raw audio at the correct sampling rate (no further check will be done). * @param {AutomaticSpeechRecognitionConfig} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object containing the transcription text and optionally timestamps if `return_timestamps` is `true`. 
- * + * * @typedef {TextAudioPipelineConstructorArgs & AutomaticSpeechRecognitionPipelineCallback & Disposable} AutomaticSpeechRecognitionPipelineType */ @@ -1452,7 +1485,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * const output = await transcriber(url); * // { text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country." } * ``` - * + * * **Example:** Transcribe English w/ timestamps. * ```javascript * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en'); @@ -1466,7 +1499,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * // ] * // } * ``` - * + * * **Example:** Transcribe English w/ word-level timestamps. * ```javascript * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en'); @@ -1485,7 +1518,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * // ] * // } * ``` - * + * * **Example:** Transcribe French. * ```javascript * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-small'); @@ -1493,7 +1526,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * const output = await transcriber(url, { language: 'french', task: 'transcribe' }); * // { text: " J'adore, j'aime, je n'aime pas, je déteste." } * ``` - * + * * **Example:** Translate French to English. * ```javascript * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-small'); @@ -1501,7 +1534,7 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * const output = await transcriber(url, { language: 'french', task: 'translate' }); * // { text: " I love, I like, I don't like, I hate." } * ``` - * + * * **Example:** Transcribe/translate audio longer than 30 seconds.
* ```javascript * const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en'); @@ -1510,208 +1543,224 @@ export class ZeroShotAudioClassificationPipeline extends (/** @type {new (option * // { text: " So in college, I was a government major, which means [...] So I'd start off light and I'd bump it up" } * ``` */ -export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineType} */ (Pipeline)) { +export class AutomaticSpeechRecognitionPipeline + extends /** @type {new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new AutomaticSpeechRecognitionPipeline. + * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new AutomaticSpeechRecognitionPipeline. - * @param {TextAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); + /** @type {AutomaticSpeechRecognitionPipelineCallback} */ + async _call(audio, kwargs = {}) { + switch (this.model.config.model_type) { + case "whisper": + return this._call_whisper(audio, kwargs); + case "wav2vec2": + case "hubert": + return this._call_wav2vec2(audio, kwargs); + default: + throw new Error( + `AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`, + ); + } + } + + /** + * @type {AutomaticSpeechRecognitionPipelineCallback} + * @private + */ + async _call_wav2vec2(audio, kwargs = {}) { + // TODO use kwargs + + if (kwargs.language) { + console.warn( + '`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".', + ); + } + if (kwargs.task) { + console.warn( + '`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".', + ); } - /** @type {AutomaticSpeechRecognitionPipelineCallback} */ - async _call(audio, kwargs = {}) { - switch (this.model.config.model_type) { - case 'whisper': - return this._call_whisper(audio, kwargs) - case 'wav2vec2': - case 'hubert': - return this._call_wav2vec2(audio, kwargs) - default: - throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`) - } + const single = !Array.isArray(audio); + if (single) { + audio = [/** @type {AudioInput} */ (audio)]; } - /** - * @type {AutomaticSpeechRecognitionPipelineCallback} - * @private - */ - async _call_wav2vec2(audio, kwargs = {}) { - // TODO use kwargs + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(audio, sampling_rate); - if (kwargs.language) { - console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'); - } - if (kwargs.task) { - console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".'); - } + const 
toReturn = []; + for (const aud of preparedAudios) { + const inputs = await this.processor(aud); + const output = await this.model(inputs); + const logits = output.logits[0]; - const single = !Array.isArray(audio); - if (single) { - audio = [/** @type {AudioInput} */ (audio)]; - } + const predicted_ids = []; + for (const item of logits) { + predicted_ids.push(max(item.data)[1]); + } + const predicted_sentences = this.tokenizer.decode(predicted_ids); + toReturn.push({ text: predicted_sentences }); + } + return single ? toReturn[0] : toReturn; + } - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(audio, sampling_rate); + /** + * @type {AutomaticSpeechRecognitionPipelineCallback} + * @private + */ + async _call_whisper(audio, kwargs = {}) { + const return_timestamps = kwargs.return_timestamps ?? false; + const chunk_length_s = kwargs.chunk_length_s ?? 0; + const chunk_callback = kwargs.chunk_callback ?? null; + const force_full_sequences = kwargs.force_full_sequences ?? false; + let stride_length_s = kwargs.stride_length_s ?? null; - const toReturn = []; - for (const aud of preparedAudios) { - const inputs = await this.processor(aud); - const output = await this.model(inputs); - const logits = output.logits[0]; - - const predicted_ids = []; - for (const item of logits) { - predicted_ids.push(max(item.data)[1]) - } - const predicted_sentences = this.tokenizer.decode(predicted_ids) - toReturn.push({ text: predicted_sentences }) - } - return single ? toReturn[0] : toReturn; + if (return_timestamps === "word") { + kwargs["return_token_timestamps"] = true; } - /** - * @type {AutomaticSpeechRecognitionPipelineCallback} - * @private - */ - async _call_whisper(audio, kwargs = {}) { + const language = pop(kwargs, "language", null); + const task = pop(kwargs, "task", null); - const return_timestamps = kwargs.return_timestamps ?? false; - const chunk_length_s = kwargs.chunk_length_s ?? 
0; - const chunk_callback = kwargs.chunk_callback ?? null; - const force_full_sequences = kwargs.force_full_sequences ?? false; - let stride_length_s = kwargs.stride_length_s ?? null; - - if (return_timestamps === 'word') { - kwargs['return_token_timestamps'] = true; - } - - const language = pop(kwargs, 'language', null); - const task = pop(kwargs, 'task', null); - - if (language || task || return_timestamps) { - if (kwargs.forced_decoder_ids) { - throw new Error("Cannot specify `language`/`task`/`return_timestamps` and `forced_decoder_ids` at the same time.") - } - // @ts-ignore - const decoder_prompt_ids = this.tokenizer.get_decoder_prompt_ids({ language, task, no_timestamps: !return_timestamps }) - if (decoder_prompt_ids.length > 0) { - kwargs.forced_decoder_ids = decoder_prompt_ids; - } - } - - const single = !Array.isArray(audio); - if (single) { - audio = [/** @type {AudioInput} */ (audio)]; - } - - const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions; - const hop_length = this.processor.feature_extractor.config.hop_length; - - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - const preparedAudios = await prepareAudios(audio, sampling_rate); - - const toReturn = []; - for (const aud of preparedAudios) { - /** @type {ChunkCallbackItem[]} */ - let chunks = []; - if (chunk_length_s > 0) { - if (stride_length_s === null) { - stride_length_s = chunk_length_s / 6; - } else if (chunk_length_s <= stride_length_s) { - throw Error("`chunk_length_s` must be larger than `stride_length_s`.") - } - - // TODO support different stride_length_s (for left and right) - - const window = sampling_rate * chunk_length_s; - const stride = sampling_rate * stride_length_s; - const jump = window - 2 * stride; - let offset = 0; - - // Create subarrays of audio with overlaps - - while (offset < aud.length) { - const subarr = aud.subarray(offset, offset + window); - const feature = await 
this.processor(subarr); - - const isFirst = offset === 0; - const isLast = offset + jump >= aud.length; - chunks.push({ - stride: [ - subarr.length, - isFirst ? 0 : stride, - isLast ? 0 : stride - ], - input_features: feature.input_features, - is_last: isLast - }) - offset += jump; - } - - } else { - chunks = [{ - stride: [aud.length, 0, 0], - input_features: (await this.processor(aud)).input_features, - is_last: true - }] - } - - // Generate for each set of input features - for (const chunk of chunks) { - kwargs.num_frames = Math.floor(chunk.stride[0] / hop_length); - - // NOTE: doing sequentially for now - const data = await this.model.generate(chunk.input_features, kwargs); - - // TODO: Right now we only get top beam - if (return_timestamps === 'word') { - chunk.tokens = data.sequences[0]; - chunk.token_timestamps = data.token_timestamps.tolist()[0].map( - (/** @type {number} */ x) => round(x, 2) - ); - - } else { - chunk.tokens = data[0]; - } - - // convert stride to seconds - chunk.stride = chunk.stride.map(x => x / sampling_rate); - - if (chunk_callback !== null) { - chunk_callback(chunk) - } - } - - // Merge text chunks - // @ts-ignore - const [full_text, optional] = this.tokenizer._decode_asr(chunks, { - time_precision, return_timestamps, force_full_sequences - }); - - toReturn.push({ text: full_text, ...optional }) - } - return single ? 
toReturn[0] : toReturn; + if (language || task || return_timestamps) { + if (kwargs.forced_decoder_ids) { + throw new Error( + "Cannot specify `language`/`task`/`return_timestamps` and `forced_decoder_ids` at the same time.", + ); + } + // @ts-ignore + const decoder_prompt_ids = this.tokenizer.get_decoder_prompt_ids({ + language, + task, + no_timestamps: !return_timestamps, + }); + if (decoder_prompt_ids.length > 0) { + kwargs.forced_decoder_ids = decoder_prompt_ids; + } } + + const single = !Array.isArray(audio); + if (single) { + audio = [/** @type {AudioInput} */ (audio)]; + } + + const time_precision = + this.processor.feature_extractor.config.chunk_length / + this.model.config.max_source_positions; + const hop_length = this.processor.feature_extractor.config.hop_length; + + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(audio, sampling_rate); + + const toReturn = []; + for (const aud of preparedAudios) { + /** @type {ChunkCallbackItem[]} */ + let chunks = []; + if (chunk_length_s > 0) { + if (stride_length_s === null) { + stride_length_s = chunk_length_s / 6; + } else if (chunk_length_s <= stride_length_s) { + throw Error( + "`chunk_length_s` must be larger than `stride_length_s`.", + ); + } + + // TODO support different stride_length_s (for left and right) + + const window = sampling_rate * chunk_length_s; + const stride = sampling_rate * stride_length_s; + const jump = window - 2 * stride; + let offset = 0; + + // Create subarrays of audio with overlaps + + while (offset < aud.length) { + const subarr = aud.subarray(offset, offset + window); + const feature = await this.processor(subarr); + + const isFirst = offset === 0; + const isLast = offset + jump >= aud.length; + chunks.push({ + stride: [subarr.length, isFirst ? 0 : stride, isLast ? 
0 : stride], + input_features: feature.input_features, + is_last: isLast, + }); + offset += jump; + } + } else { + chunks = [ + { + stride: [aud.length, 0, 0], + input_features: (await this.processor(aud)).input_features, + is_last: true, + }, + ]; + } + + // Generate for each set of input features + for (const chunk of chunks) { + kwargs.num_frames = Math.floor(chunk.stride[0] / hop_length); + + // NOTE: doing sequentially for now + const data = await this.model.generate(chunk.input_features, kwargs); + + // TODO: Right now we only get top beam + if (return_timestamps === "word") { + chunk.tokens = data.sequences[0]; + chunk.token_timestamps = data.token_timestamps + .tolist()[0] + .map((/** @type {number} */ x) => round(x, 2)); + } else { + chunk.tokens = data[0]; + } + + // convert stride to seconds + chunk.stride = chunk.stride.map((x) => x / sampling_rate); + + if (chunk_callback !== null) { + chunk_callback(chunk); + } + } + + // Merge text chunks + // @ts-ignore + const [full_text, optional] = this.tokenizer._decode_asr(chunks, { + time_precision, + return_timestamps, + force_full_sequences, + }); + + toReturn.push({ text: full_text, ...optional }); + } + return single ? toReturn[0] : toReturn; + } } /** * @typedef {Object} ImageToTextSingle * @property {string} generated_text The generated text. * @typedef {ImageToTextSingle[]} ImageToTextOutput - * + * * @callback ImageToTextPipelineCallback Assign labels to the image(s) passed as inputs. * @param {ImagePipelineInputs} texts The images to be captioned. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object (or array of objects) containing the generated text(s). - * + * * @typedef {TextImagePipelineConstructorArgs & ImageToTextPipelineCallback & Disposable} ImageToTextPipelineType */ /** * Image To Text pipeline using a `AutoModelForVision2Seq`. 
This pipeline predicts a caption for a given image. - * + * * **Example:** Generate a caption for an image w/ `Xenova/vit-gpt2-image-captioning`. * ```javascript * const captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning'); @@ -1719,7 +1768,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options * const output = await captioner(url); * // [{ generated_text: 'a cat laying on a couch with another cat' }] * ``` - * + * * **Example:** Optical Character Recognition (OCR) w/ `Xenova/trocr-small-handwritten`. * ```javascript * const captioner = await pipeline('image-to-text', 'Xenova/trocr-small-handwritten'); @@ -1728,36 +1777,40 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options * // [{ generated_text: 'Mr. Brown commented icily.' }] * ``` */ -export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineType} */ (Pipeline)) { +export class ImageToTextPipeline + extends /** @type {new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ImageToTextPipeline. + * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new ImageToTextPipeline. - * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); + /** @type {ImageToTextPipelineCallback} */ + async _call(images, generate_kwargs = {}) { + const isBatched = Array.isArray(images); + const preparedImages = await prepareImages(images); + + const { pixel_values } = await this.processor(preparedImages); + + const toReturn = []; + for (const batch of pixel_values) { + batch.dims = [1, ...batch.dims]; + const output = await this.model.generate(batch, generate_kwargs); + const decoded = this.tokenizer + .batch_decode(output, { + skip_special_tokens: true, + }) + .map((x) => ({ generated_text: x.trim() })); + toReturn.push(decoded); } - /** @type {ImageToTextPipelineCallback} */ - async _call(images, generate_kwargs = {}) { - - const isBatched = Array.isArray(images); - const preparedImages = await prepareImages(images); - - const { pixel_values } = await this.processor(preparedImages); - - const toReturn = []; - for (const batch of pixel_values) { - batch.dims = [1, ...batch.dims] - const output = await this.model.generate(batch, generate_kwargs); - const decoded = this.tokenizer.batch_decode(output, { - skip_special_tokens: true, - }).map(x => ({ generated_text: x.trim() })) - toReturn.push(decoded); - } - - return isBatched ? toReturn : toReturn[0]; - } + return isBatched ? toReturn : toReturn[0]; + } } /** @@ -1765,22 +1818,22 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * @property {string} label The label identified by the model. * @property {number} score The score attributed by the model for that label. * @typedef {ImageClassificationSingle[]} ImageClassificationOutput - * + * * @typedef {Object} ImageClassificationPipelineOptions Parameters specific to image classification pipelines. - * @property {number} [topk=1] The number of top labels that will be returned by the pipeline. - * + * @property {number} [topk=1] The number of top labels that will be returned by the pipeline. 
+ * * @callback ImageClassificationPipelineCallback Assign labels to the image(s) passed as inputs. * @param {ImagePipelineInputs} images The input images(s) to be classified. * @param {ImageClassificationPipelineOptions} [options] The options to use for image classification. * @returns {Promise} An array or object containing the predicted labels and scores. - * + * * @typedef {ImagePipelineConstructorArgs & ImageClassificationPipelineCallback & Disposable} ImageClassificationPipelineType */ /** * Image classification pipeline using any `AutoModelForImageClassification`. * This pipeline predicts the class of an image. - * + * * **Example:** Classify an image. * ```javascript * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224'); @@ -1790,7 +1843,7 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * // { label: 'tiger, Panthera tigris', score: 0.632695734500885 }, * // ] * ``` - * + * * **Example:** Classify an image and return top `n` classes. * ```javascript * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224'); @@ -1802,7 +1855,7 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * // { label: 'lion, king of beasts, Panthera leo', score: 0.00045060308184474707 }, * // ] * ``` - * + * * **Example:** Classify an image and return all classes. 
* ```javascript * const classifier = await pipeline('image-classification', 'Xenova/vit-base-patch16-224'); @@ -1817,46 +1870,47 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe * // ] * ``` */ -export class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineType} */ (Pipeline)) { +export class ImageClassificationPipeline + extends /** @type {new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ImageClassificationPipeline. + * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new ImageClassificationPipeline. - * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); - } - - /** @type {ImageClassificationPipelineCallback} */ - async _call(images, { - topk = 1 - } = {}) { - - const isBatched = Array.isArray(images); - const preparedImages = await prepareImages(images); - - const { pixel_values } = await this.processor(preparedImages); - const output = await this.model({ pixel_values }); - - const id2label = this.model.config.id2label; - const toReturn = []; - for (const batch of output.logits) { - const scores = getTopItems(softmax(batch.data), topk); - - const vals = scores.map(x => ({ - label: id2label[x[0]], - score: x[1], - })); - if (topk === 1) { - toReturn.push(...vals); - } else { - toReturn.push(vals); - } - } - - return isBatched || topk === 1 ? 
/** @type {ImageClassificationOutput} */ (toReturn) : /** @type {ImageClassificationOutput[]} */ (toReturn)[0]; + /** @type {ImageClassificationPipelineCallback} */ + async _call(images, { topk = 1 } = {}) { + const isBatched = Array.isArray(images); + const preparedImages = await prepareImages(images); + + const { pixel_values } = await this.processor(preparedImages); + const output = await this.model({ pixel_values }); + + const id2label = this.model.config.id2label; + const toReturn = []; + for (const batch of output.logits) { + const scores = getTopItems(softmax(batch.data), topk); + + const vals = scores.map((x) => ({ + label: id2label[x[0]], + score: x[1], + })); + if (topk === 1) { + toReturn.push(...vals); + } else { + toReturn.push(vals); + } } + return isBatched || topk === 1 + ? /** @type {ImageClassificationOutput} */ (toReturn) + : /** @type {ImageClassificationOutput[]} */ (toReturn)[0]; + } } /** @@ -1864,7 +1918,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image * @property {string} label The label of the segment. * @property {number|null} score The score of the segment. * @property {RawImage} mask The mask of the segment. - * + * * @typedef {Object} ImageSegmentationPipelineOptions Parameters specific to image segmentation pipelines. * @property {number} [threshold=0.5] Probability threshold to filter out predicted masks. * @property {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values. @@ -1873,19 +1927,19 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image * depending on model capabilities. If not set, the pipeline will attempt to resolve (in that order). * @property {number[]} [label_ids_to_fuse=null] List of label ids to fuse. If not set, do not fuse any labels. * @property {number[][]} [target_sizes=null] List of target sizes for the input images. If not set, use the original image sizes. 
- * + * * @callback ImageSegmentationPipelineCallback Segment the input images. * @param {ImagePipelineInputs} images The input images. * @param {ImageSegmentationPipelineOptions} [options] The options to use for image segmentation. * @returns {Promise} The annotated segments. - * + * * @typedef {ImagePipelineConstructorArgs & ImageSegmentationPipelineCallback & Disposable} ImageSegmentationPipelineType */ /** * Image segmentation pipeline using any `AutoModelForXXXSegmentation`. * This pipeline predicts masks of objects and their classes. - * + * * **Example:** Perform image segmentation with `Xenova/detr-resnet-50-panoptic`. * ```javascript * const segmenter = await pipeline('image-segmentation', 'Xenova/detr-resnet-50-panoptic'); @@ -1897,139 +1951,152 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image * // ] * ``` */ -export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ (Pipeline)) { - /** - * Create a new ImageSegmentationPipeline. - * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class ImageSegmentationPipeline + extends /** @type {new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ImageSegmentationPipeline. + * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); - this.subtasks_mapping = { - // Mapping of subtasks to their corresponding post-processing function names. - panoptic: 'post_process_panoptic_segmentation', - instance: 'post_process_instance_segmentation', - semantic: 'post_process_semantic_segmentation' - } + this.subtasks_mapping = { + // Mapping of subtasks to their corresponding post-processing function names. 
+ panoptic: "post_process_panoptic_segmentation", + instance: "post_process_instance_segmentation", + semantic: "post_process_semantic_segmentation", + }; + } + + /** @type {ImageSegmentationPipelineCallback} */ + async _call( + images, + { + threshold = 0.5, + mask_threshold = 0.5, + overlap_mask_area_threshold = 0.8, + label_ids_to_fuse = null, + target_sizes = null, + subtask = null, + } = {}, + ) { + const isBatched = Array.isArray(images); + + if (isBatched && images.length !== 1) { + throw Error( + "Image segmentation pipeline currently only supports a batch size of 1.", + ); } - /** @type {ImageSegmentationPipelineCallback} */ - async _call(images, { - threshold = 0.5, - mask_threshold = 0.5, - overlap_mask_area_threshold = 0.8, - label_ids_to_fuse = null, - target_sizes = null, - subtask = null, - } = {}) { - const isBatched = Array.isArray(images); + const preparedImages = await prepareImages(images); + const imageSizes = preparedImages.map((x) => [x.height, x.width]); - if (isBatched && images.length !== 1) { - throw Error("Image segmentation pipeline currently only supports a batch size of 1."); + const { pixel_values, pixel_mask } = await this.processor(preparedImages); + const output = await this.model({ pixel_values, pixel_mask }); + + let fn = null; + if (subtask !== null) { + fn = this.subtasks_mapping[subtask]; + } else { + for (let [task, func] of Object.entries(this.subtasks_mapping)) { + if (func in this.processor.feature_extractor) { + fn = this.processor.feature_extractor[func].bind( + this.processor.feature_extractor, + ); + subtask = task; + break; } - - const preparedImages = await prepareImages(images); - const imageSizes = preparedImages.map(x => [x.height, x.width]); - - const { pixel_values, pixel_mask } = await this.processor(preparedImages); - const output = await this.model({ pixel_values, pixel_mask }); - - let fn = null; - if (subtask !== null) { - fn = this.subtasks_mapping[subtask]; - } else { - for (let [task, func] of 
Object.entries(this.subtasks_mapping)) { - if (func in this.processor.feature_extractor) { - fn = this.processor.feature_extractor[func].bind(this.processor.feature_extractor); - subtask = task; - break; - } - } - } - - const id2label = this.model.config.id2label; - - /** @type {ImageSegmentationPipelineOutput[]} */ - const annotation = []; - if (subtask === 'panoptic' || subtask === 'instance') { - const processed = fn( - output, - threshold, - mask_threshold, - overlap_mask_area_threshold, - label_ids_to_fuse, - target_sizes ?? imageSizes, // TODO FIX? - )[0]; - - const segmentation = processed.segmentation; - - for (const segment of processed.segments_info) { - const maskData = new Uint8ClampedArray(segmentation.data.length); - for (let i = 0; i < segmentation.data.length; ++i) { - if (segmentation.data[i] === segment.id) { - maskData[i] = 255; - } - } - - // const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1) - - annotation.push({ - score: segment.score, - label: id2label[segment.label_id], - // mask: mask - }) - } - - } else if (subtask === 'semantic') { - const { segmentation, labels } = fn(output, target_sizes ?? 
imageSizes)[0]; - - for (const label of labels) { - const maskData = new Uint8ClampedArray(segmentation.data.length); - for (let i = 0; i < segmentation.data.length; ++i) { - if (segmentation.data[i] === label) { - maskData[i] = 255; - } - } - - // const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1); - - annotation.push({ - score: null, - label: id2label[label], - // mask: mask - }); - } - } else { - throw Error(`Subtask ${subtask} not supported.`); - } - - return annotation; + } } + + const id2label = this.model.config.id2label; + + /** @type {ImageSegmentationPipelineOutput[]} */ + const annotation = []; + if (subtask === "panoptic" || subtask === "instance") { + const processed = fn( + output, + threshold, + mask_threshold, + overlap_mask_area_threshold, + label_ids_to_fuse, + target_sizes ?? imageSizes, // TODO FIX? + )[0]; + + const segmentation = processed.segmentation; + + for (const segment of processed.segments_info) { + const maskData = new Uint8ClampedArray(segmentation.data.length); + for (let i = 0; i < segmentation.data.length; ++i) { + if (segmentation.data[i] === segment.id) { + maskData[i] = 255; + } + } + + // const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1) + + annotation.push({ + score: segment.score, + label: id2label[segment.label_id], + // mask: mask + }); + } + } else if (subtask === "semantic") { + const { segmentation, labels } = fn( + output, + target_sizes ?? 
imageSizes, + )[0]; + + for (const label of labels) { + const maskData = new Uint8ClampedArray(segmentation.data.length); + for (let i = 0; i < segmentation.data.length; ++i) { + if (segmentation.data[i] === label) { + maskData[i] = 255; + } + } + + // const mask = new RawImage(maskData, segmentation.dims[1], segmentation.dims[0], 1); + + annotation.push({ + score: null, + label: id2label[label], + // mask: mask + }); + } + } else { + throw Error(`Subtask ${subtask} not supported.`); + } + + return annotation; + } } /** * @typedef {Object} ZeroShotImageClassificationOutput * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`. * @property {number} score The score attributed by the model for that label (between 0 and 1). - * + * * @typedef {Object} ZeroShotImageClassificationPipelineOptions Parameters specific to zero-shot image classification pipelines. * @property {string} [hypothesis_template="This is a photo of {}"] The sentence used in conjunction with `candidate_labels` * to attempt the image classification by replacing the placeholder with the candidate_labels. * Then likelihood is estimated by using `logits_per_image`. - * + * * @callback ZeroShotImageClassificationPipelineCallback Assign labels to the image(s) passed as inputs. * @param {ImagePipelineInputs} images The input images. * @param {string[]} candidate_labels The candidate labels for this image. * @param {ZeroShotImageClassificationPipelineOptions} [options] The options to use for zero-shot image classification. * @returns {Promise} An array of objects containing the predicted labels and scores. - * + * * @typedef {TextImagePipelineConstructorArgs & ZeroShotImageClassificationPipelineCallback & Disposable} ZeroShotImageClassificationPipelineType */ /** * Zero shot image classification pipeline. This pipeline predicts the class of * an image when you provide an image and a set of `candidate_labels`. 
- * + * * **Example:** Zero shot image classification w/ `Xenova/clip-vit-base-patch32`. * ```javascript * const classifier = await pipeline('zero-shot-image-classification', 'Xenova/clip-vit-base-patch32'); @@ -2042,87 +2109,91 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi * // ] * ``` */ -export class ZeroShotImageClassificationPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineType} */ (Pipeline)) { - /** - * Create a new ZeroShotImageClassificationPipeline. - * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class ZeroShotImageClassificationPipeline + extends /** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ZeroShotImageClassificationPipeline. + * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } + + /** @type {ZeroShotImageClassificationPipelineCallback} */ + async _call( + images, + candidate_labels, + { hypothesis_template = "This is a photo of {}" } = {}, + ) { + const isBatched = Array.isArray(images); + const preparedImages = await prepareImages(images); + + // Insert label into hypothesis template + const texts = candidate_labels.map((x) => + hypothesis_template.replace("{}", x), + ); + + // Run tokenization + const text_inputs = this.tokenizer(texts, { + padding: this.model.config.model_type === "siglip" ? "max_length" : true, + truncation: true, + }); + + // Run processor + const { pixel_values } = await this.processor(preparedImages); + + // Run model with both text and pixel inputs + const output = await this.model({ ...text_inputs, pixel_values }); + + const function_to_apply = + this.model.config.model_type === "siglip" + ? 
(batch) => batch.sigmoid().data + : (batch) => softmax(batch.data); + + // Compare each image with each candidate label + const toReturn = []; + for (const batch of output.logits_per_image) { + // Compute softmax per image + const probs = function_to_apply(batch); + + const result = [...probs].map((x, i) => ({ + score: x, + label: candidate_labels[i], + })); + result.sort((a, b) => b.score - a.score); // sort by score in descending order + toReturn.push(result); } - /** @type {ZeroShotImageClassificationPipelineCallback} */ - async _call(images, candidate_labels, { - hypothesis_template = "This is a photo of {}" - } = {}) { - - const isBatched = Array.isArray(images); - const preparedImages = await prepareImages(images); - - // Insert label into hypothesis template - const texts = candidate_labels.map( - x => hypothesis_template.replace('{}', x) - ); - - // Run tokenization - const text_inputs = this.tokenizer(texts, { - padding: this.model.config.model_type === 'siglip' ? 'max_length' : true, - truncation: true, - }); - - // Run processor - const { pixel_values } = await this.processor(preparedImages); - - // Run model with both text and pixel inputs - const output = await this.model({ ...text_inputs, pixel_values }); - - const function_to_apply = - this.model.config.model_type === 'siglip' - ? batch => batch.sigmoid().data - : batch => softmax(batch.data); - - // Compare each image with each candidate label - const toReturn = []; - for (const batch of output.logits_per_image) { - // Compute softmax per image - const probs = function_to_apply(batch); - - const result = [...probs].map((x, i) => ({ - score: x, - label: candidate_labels[i] - })); - result.sort((a, b) => b.score - a.score); // sort by score in descending order - toReturn.push(result); - } - - return isBatched ? toReturn : toReturn[0]; - } + return isBatched ? 
toReturn : toReturn[0]; + } } - /** * @typedef {Object} ObjectDetectionPipelineSingle * @property {string} label The class label identified by the model. * @property {number} score The score attributed by the model for that label. * @property {BoundingBox} box The bounding box of detected object in image's original size, or as a percentage if `percentage` is set to true. * @typedef {ObjectDetectionPipelineSingle[]} ObjectDetectionPipelineOutput - * + * * @typedef {Object} ObjectDetectionPipelineOptions Parameters specific to object detection pipelines. * @property {number} [threshold=0.9] The threshold used to filter boxes by score. * @property {boolean} [percentage=false] Whether to return the boxes coordinates in percentage (true) or in pixels (false). - * + * * @callback ObjectDetectionPipelineCallback Detect objects (bounding boxes & classes) in the image(s) passed as inputs. * @param {ImagePipelineInputs} images The input images. * @param {ObjectDetectionPipelineOptions} [options] The options to use for object detection. - * @returns {Promise} A list of objects or a list of list of objects. - * + * @returns {Promise} A list of objects or a list of list of objects. + * * @typedef {ImagePipelineConstructorArgs & ObjectDetectionPipelineCallback & Disposable} ObjectDetectionPipelineType */ /** * Object detection pipeline using any `AutoModelForObjectDetection`. * This pipeline predicts bounding boxes of objects and their classes. - * + * * **Example:** Run object-detection with `Xenova/detr-resnet-50`. 
* ```javascript * const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50'); @@ -2141,81 +2212,88 @@ export class ZeroShotImageClassificationPipeline extends (/** @type {new (option * // }] * ``` */ -export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineType} */ (Pipeline)) { +export class ObjectDetectionPipeline + extends /** @type {new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ObjectDetectionPipeline. + * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new ObjectDetectionPipeline. - * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** @type {ObjectDetectionPipelineCallback} */ + async _call(images, { threshold = 0.9, percentage = false } = {}) { + const isBatched = Array.isArray(images); + + if (isBatched && images.length !== 1) { + throw Error( + "Object detection pipeline currently only supports a batch size of 1.", + ); } + const preparedImages = await prepareImages(images); - /** @type {ObjectDetectionPipelineCallback} */ - async _call(images, { - threshold = 0.9, - percentage = false, - } = {}) { + const imageSizes = percentage + ? 
null + : preparedImages.map((x) => [x.height, x.width]); - const isBatched = Array.isArray(images); + const { pixel_values, pixel_mask } = await this.processor(preparedImages); + const output = await this.model({ pixel_values, pixel_mask }); - if (isBatched && images.length !== 1) { - throw Error("Object detection pipeline currently only supports a batch size of 1."); - } - const preparedImages = await prepareImages(images); + // @ts-ignore + const processed = + this.processor.feature_extractor.post_process_object_detection( + output, + threshold, + imageSizes, + ); - const imageSizes = percentage ? null : preparedImages.map(x => [x.height, x.width]); + // Add labels + const id2label = this.model.config.id2label; - const { pixel_values, pixel_mask } = await this.processor(preparedImages); - const output = await this.model({ pixel_values, pixel_mask }); + // Format output + /** @type {ObjectDetectionPipelineOutput[]} */ + const result = processed.map((batch) => + batch.boxes.map((box, i) => ({ + score: batch.scores[i], + label: id2label[batch.classes[i]], + box: get_bounding_box(box, !percentage), + })), + ); - // @ts-ignore - const processed = this.processor.feature_extractor.post_process_object_detection(output, threshold, imageSizes); - - // Add labels - const id2label = this.model.config.id2label; - - // Format output - /** @type {ObjectDetectionPipelineOutput[]} */ - const result = processed.map(batch => ( - batch.boxes.map((box, i) => ({ - score: batch.scores[i], - label: id2label[batch.classes[i]], - box: get_bounding_box(box, !percentage), - })) - )) - - return isBatched ? result : result[0]; - } + return isBatched ? result : result[0]; + } } - /** * @typedef {Object} ZeroShotObjectDetectionOutput * @property {string} label Text query corresponding to the found object. * @property {number} score Score corresponding to the object (between 0 and 1). 
* @property {BoundingBox} box Bounding box of the detected object in image's original size, or as a percentage if `percentage` is set to true. - * + * * @typedef {Object} ZeroShotObjectDetectionPipelineOptions Parameters specific to zero-shot object detection pipelines. * @property {number} [threshold=0.1] The probability necessary to make a prediction. * @property {number} [topk=null] The number of top predictions that will be returned by the pipeline. * If the provided number is `null` or higher than the number of predictions available, it will default * to the number of predictions. * @property {boolean} [percentage=false] Whether to return the boxes coordinates in percentage (true) or in pixels (false). - * + * * @callback ZeroShotObjectDetectionPipelineCallback Detect objects (bounding boxes & classes) in the image(s) passed as inputs. * @param {ImagePipelineInputs} images The input images. * @param {string[]} candidate_labels What the model should recognize in the image. * @param {ZeroShotObjectDetectionPipelineOptions} [options] The options to use for zero-shot object detection. * @returns {Promise} An array of objects containing the predicted labels, scores, and bounding boxes. - * + * * @typedef {TextImagePipelineConstructorArgs & ZeroShotObjectDetectionPipelineCallback & Disposable} ZeroShotObjectDetectionPipelineType */ /** * Zero-shot object detection pipeline. This pipeline predicts bounding boxes of * objects when you provide an image and a set of `candidate_labels`. - * + * * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32`. * ```javascript * const detector = await pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32'); @@ -2245,7 +2323,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe * // } * // ] * ``` - * + * * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32` (returning top 4 matches and setting a threshold). 
* ```javascript * const detector = await pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32'); @@ -2276,75 +2354,85 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe * // ] * ``` */ -export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineType} */ (Pipeline)) { +export class ZeroShotObjectDetectionPipeline + extends /** @type {new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ZeroShotObjectDetectionPipeline. + * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new ZeroShotObjectDetectionPipeline. - * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** @type {ZeroShotObjectDetectionPipelineCallback} */ + async _call( + images, + candidate_labels, + { threshold = 0.1, topk = null, percentage = false } = {}, + ) { + const isBatched = Array.isArray(images); + const preparedImages = await prepareImages(images); + + // Run tokenization + const text_inputs = this.tokenizer(candidate_labels, { + padding: true, + truncation: true, + }); + + // Run processor + const model_inputs = await this.processor(preparedImages); + + // Since non-maximum suppression is performed for exporting, we need to + // process each image separately. For more information, see: + // https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032 + const toReturn = []; + for (let i = 0; i < preparedImages.length; ++i) { + const image = preparedImages[i]; + const imageSize = percentage ? 
null : [[image.height, image.width]]; + const pixel_values = model_inputs.pixel_values[i].unsqueeze_(0); + + // Run model with both text and pixel inputs + const output = await this.model({ ...text_inputs, pixel_values }); + + // @ts-ignore + const processed = + this.processor.feature_extractor.post_process_object_detection( + output, + threshold, + imageSize, + true, + )[0]; + let result = processed.boxes + .map((box, i) => ({ + score: processed.scores[i], + label: candidate_labels[processed.classes[i]], + box: get_bounding_box(box, !percentage), + })) + .sort((a, b) => b.score - a.score); + if (topk !== null) { + result = result.slice(0, topk); + } + toReturn.push(result); } - /** @type {ZeroShotObjectDetectionPipelineCallback} */ - async _call(images, candidate_labels, { - threshold = 0.1, - topk = null, - percentage = false, - } = {}) { - - const isBatched = Array.isArray(images); - const preparedImages = await prepareImages(images); - - // Run tokenization - const text_inputs = this.tokenizer(candidate_labels, { - padding: true, - truncation: true, - }); - - // Run processor - const model_inputs = await this.processor(preparedImages); - - // Since non-maximum suppression is performed for exporting, we need to - // process each image separately. For more information, see: - // https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032 - const toReturn = []; - for (let i = 0; i < preparedImages.length; ++i) { - const image = preparedImages[i]; - const imageSize = percentage ? 
null : [[image.height, image.width]]; - const pixel_values = model_inputs.pixel_values[i].unsqueeze_(0); - - // Run model with both text and pixel inputs - const output = await this.model({ ...text_inputs, pixel_values }); - - // @ts-ignore - const processed = this.processor.feature_extractor.post_process_object_detection(output, threshold, imageSize, true)[0]; - let result = processed.boxes.map((box, i) => ({ - score: processed.scores[i], - label: candidate_labels[processed.classes[i]], - box: get_bounding_box(box, !percentage), - })).sort((a, b) => b.score - a.score); - if (topk !== null) { - result = result.slice(0, topk); - } - toReturn.push(result) - } - - return isBatched ? toReturn : toReturn[0]; - } + return isBatched ? toReturn : toReturn[0]; + } } /** * @typedef {Object} DocumentQuestionAnsweringSingle * @property {string} answer The generated text. * @typedef {DocumentQuestionAnsweringSingle[]} DocumentQuestionAnsweringOutput - * + * * @callback DocumentQuestionAnsweringPipelineCallback Answer the question given as input by using the document. * @param {ImageInput} image The image of the document to use. * @param {string} question A question to ask of the document. * @param {import('./utils/generation.js').GenerationConfigType} [options] Additional keyword arguments to pass along to the generate method of the model. * @returns {Promise} An object (or array of objects) containing the answer(s). - * + * * @typedef {TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable} DocumentQuestionAnsweringPipelineType */ @@ -2352,7 +2440,7 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T * Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. * The inputs/outputs are similar to the (extractive) question answering pipeline; however, * the pipeline takes an image (and optional OCR'd words/boxes) as input instead of text context. 
- * + * * **Example:** Answer questions about a document with `Xenova/donut-base-finetuned-docvqa`. * ```javascript * const qa_pipeline = await pipeline('document-question-answering', 'Xenova/donut-base-finetuned-docvqa'); @@ -2362,57 +2450,55 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T * // [{ answer: 'us-001' }] * ``` */ -export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineType} */ (Pipeline)) { +export class DocumentQuestionAnsweringPipeline + extends /** @type {new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new DocumentQuestionAnsweringPipeline. + * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } - /** - * Create a new DocumentQuestionAnsweringPipeline. - * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline. 
- */ - constructor(options) { - super(options); - } - - /** @type {DocumentQuestionAnsweringPipelineCallback} */ - async _call(image, question, generate_kwargs = {}) { - - // NOTE: For now, we only support a batch size of 1 - - // Preprocess image - const preparedImage = (await prepareImages(image))[0]; - const { pixel_values } = await this.processor(preparedImage); - - // Run tokenization - const task_prompt = `${question}`; - const decoder_input_ids = this.tokenizer(task_prompt, { - add_special_tokens: false, - padding: true, - truncation: true, - }).input_ids; - - // Run model - const output = await this.model.generate( - pixel_values, - { - ...generate_kwargs, - decoder_input_ids, - max_length: this.model.config.decoder.max_position_embeddings, - } - ); - - // Decode output - const decoded = this.tokenizer.batch_decode(output)[0]; - - // Parse answer - const match = decoded.match(/(.*?)<\/s_answer>/); - let answer = null; - if (match && match.length >= 2) { - answer = match[1].trim(); - } - return [{ answer }]; + /** @type {DocumentQuestionAnsweringPipelineCallback} */ + async _call(image, question, generate_kwargs = {}) { + // NOTE: For now, we only support a batch size of 1 + + // Preprocess image + const preparedImage = (await prepareImages(image))[0]; + const { pixel_values } = await this.processor(preparedImage); + + // Run tokenization + const task_prompt = `${question}`; + const decoder_input_ids = this.tokenizer(task_prompt, { + add_special_tokens: false, + padding: true, + truncation: true, + }).input_ids; + + // Run model + const output = await this.model.generate(pixel_values, { + ...generate_kwargs, + decoder_input_ids, + max_length: this.model.config.decoder.max_position_embeddings, + }); + + // Decode output + const decoded = this.tokenizer.batch_decode(output)[0]; + + // Parse answer + const match = decoded.match(/(.*?)<\/s_answer>/); + let answer = null; + if (match && match.length >= 2) { + answer = match[1].trim(); } + return [{ answer }]; + } 
} - /** * @typedef {Object} VocoderOptions * @property {PreTrainedModel} [vocoder] The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder. @@ -2423,22 +2509,22 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: * @typedef {Object} TextToAudioOutput * @property {Float32Array} audio The generated audio waveform. * @property {number} sampling_rate The sampling rate of the generated audio waveform. - * + * * @typedef {Object} TextToAudioPipelineOptions Parameters specific to text-to-audio pipelines. * @property {Tensor|Float32Array|string|URL} [speaker_embeddings=null] The speaker embeddings (if the model requires it). - * + * * @callback TextToAudioPipelineCallback Generates speech/audio from the inputs. * @param {string|string[]} texts The text(s) to generate. * @param {TextToAudioPipelineOptions} options Parameters passed to the model generation/forward method. * @returns {Promise} An object containing the generated audio and sampling rate. - * + * * @typedef {TextToAudioPipelineConstructorArgs & TextToAudioPipelineCallback & Disposable} TextToAudioPipelineType */ /** * Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. * This pipeline generates an audio file from an input text and optional other conditional inputs. - * + * * **Example:** Generate audio from text with `Xenova/speecht5_tts`. 
* ```javascript * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false }); @@ -2449,17 +2535,17 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: * // sampling_rate: 16000 * // } * ``` - * + * * You can then save the audio to a .wav file with the `wavefile` package: * ```javascript * import wavefile from 'wavefile'; * import fs from 'fs'; - * + * * const wav = new wavefile.WaveFile(); * wav.fromScratch(1, out.sampling_rate, '32f', out.audio); * fs.writeFileSync('out.wav', wav.toBuffer()); * ``` - * + * * **Example:** Multilingual speech generation with `Xenova/mms-tts-fra`. See [here](https://huggingface.co/models?pipeline_tag=text-to-speech&other=vits&sort=trending) for the full list of available languages (1107). * ```javascript * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra'); @@ -2470,107 +2556,117 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: * // } * ``` */ -export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineType} */ (Pipeline)) { - DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan" +export class TextToAudioPipeline + extends /** @type {new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineType} */ ( + Pipeline + ) +{ + DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"; - /** - * Create a new TextToAudioPipeline. - * @param {TextToAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); + /** + * Create a new TextToAudioPipeline. + * @param {TextToAudioPipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); - // TODO: Find a better way for `pipeline` to set the default vocoder - this.vocoder = options.vocoder ?? 
null; + // TODO: Find a better way for `pipeline` to set the default vocoder + this.vocoder = options.vocoder ?? null; + } + + /** @type {TextToAudioPipelineCallback} */ + async _call(text_inputs, { speaker_embeddings = null } = {}) { + // If this.processor is not set, we are using a `AutoModelForTextToWaveform` model + if (this.processor) { + return this._call_text_to_spectrogram(text_inputs, { + speaker_embeddings, + }); + } else { + return this._call_text_to_waveform(text_inputs); + } + } + + async _call_text_to_waveform(text_inputs) { + // Run tokenization + const inputs = this.tokenizer(text_inputs, { + padding: true, + truncation: true, + }); + + // Generate waveform + const { waveform } = await this.model(inputs); + + const sampling_rate = this.model.config.sampling_rate; + return { + audio: waveform.data, + sampling_rate, + }; + } + + async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) { + // Load vocoder, if not provided + if (!this.vocoder) { + console.log("No vocoder specified, using default HifiGan vocoder."); + this.vocoder = await AutoModel.from_pretrained(this.DEFAULT_VOCODER_ID, { + quantized: false, + }); } - - /** @type {TextToAudioPipelineCallback} */ - async _call(text_inputs, { - speaker_embeddings = null, - } = {}) { - - // If this.processor is not set, we are using a `AutoModelForTextToWaveform` model - if (this.processor) { - return this._call_text_to_spectrogram(text_inputs, { speaker_embeddings }); - } else { - return this._call_text_to_waveform(text_inputs); - } + // Load speaker embeddings as Float32Array from path/URL + if ( + typeof speaker_embeddings === "string" || + speaker_embeddings instanceof URL + ) { + // Load from URL with fetch + speaker_embeddings = new Float32Array( + await (await fetch(speaker_embeddings)).arrayBuffer(), + ); } - async _call_text_to_waveform(text_inputs) { - - // Run tokenization - const inputs = this.tokenizer(text_inputs, { - padding: true, - truncation: true, - }); - - // Generate 
waveform - const { waveform } = await this.model(inputs); - - const sampling_rate = this.model.config.sampling_rate; - return { - audio: waveform.data, - sampling_rate, - } + if (speaker_embeddings instanceof Float32Array) { + speaker_embeddings = new Tensor("float32", speaker_embeddings, [ + 1, + speaker_embeddings.length, + ]); + } else if (!(speaker_embeddings instanceof Tensor)) { + throw new Error( + "Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.", + ); } - async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) { + // Run tokenization + const { input_ids } = this.tokenizer(text_inputs, { + padding: true, + truncation: true, + }); - // Load vocoder, if not provided - if (!this.vocoder) { - console.log('No vocoder specified, using default HifiGan vocoder.'); - this.vocoder = await AutoModel.from_pretrained(this.DEFAULT_VOCODER_ID, { quantized: false }); - } + // NOTE: At this point, we are guaranteed that `speaker_embeddings` is a `Tensor` + // @ts-ignore + const { waveform } = await this.model.generate_speech( + input_ids, + speaker_embeddings, + { vocoder: this.vocoder }, + ); - // Load speaker embeddings as Float32Array from path/URL - if (typeof speaker_embeddings === 'string' || speaker_embeddings instanceof URL) { - // Load from URL with fetch - speaker_embeddings = new Float32Array( - await (await fetch(speaker_embeddings)).arrayBuffer() - ); - } - - if (speaker_embeddings instanceof Float32Array) { - speaker_embeddings = new Tensor( - 'float32', - speaker_embeddings, - [1, speaker_embeddings.length] - ) - } else if (!(speaker_embeddings instanceof Tensor)) { - throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.") - } - - // Run tokenization - const { input_ids } = this.tokenizer(text_inputs, { - padding: true, - truncation: true, - }); - - // NOTE: At this point, we are guaranteed that `speaker_embeddings` is a `Tensor` - // @ts-ignore - const { waveform } = await 
this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder }); - - const sampling_rate = this.processor.feature_extractor.config.sampling_rate; - return { - audio: waveform.data, - sampling_rate, - } - } + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + return { + audio: waveform.data, + sampling_rate, + }; + } } /** * @callback ImageToImagePipelineCallback Transform the image(s) passed as inputs. * @param {ImagePipelineInputs} images The images to transform. * @returns {Promise} The transformed image or list of images. - * + * * @typedef {ImagePipelineConstructorArgs & ImageToImagePipelineCallback & Disposable} ImageToImagePipelineType */ /** * Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous image input. - * + * * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64` * ```javascript * const upscaler = await pipeline('image-to-image', 'Xenova/swin2SR-classical-sr-x2-64'); @@ -2584,48 +2680,56 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi * // } * ``` */ -export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineType} */ (Pipeline)) { - /** - * Create a new ImageToImagePipeline. - * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class ImageToImagePipeline + extends /** @type {new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new ImageToImagePipeline. + * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. 
+ */ + constructor(options) { + super(options); + } + + /** @type {ImageToImagePipelineCallback} */ + async _call(images) { + const preparedImages = await prepareImages(images); + const inputs = await this.processor(preparedImages); + const outputs = await this.model(inputs); + + /** @type {RawImage[]} */ + const toReturn = []; + for (const batch of outputs.reconstruction) { + const output = batch + .squeeze() + .clamp_(0, 1) + .mul_(255) + .round_() + .to("uint8"); + // toReturn.push(RawImage.fromTensor(output)); } - /** @type {ImageToImagePipelineCallback} */ - async _call(images) { - - const preparedImages = await prepareImages(images); - const inputs = await this.processor(preparedImages); - const outputs = await this.model(inputs); - - /** @type {RawImage[]} */ - const toReturn = []; - for (const batch of outputs.reconstruction) { - const output = batch.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); - // toReturn.push(RawImage.fromTensor(output)); - } - - return toReturn.length > 1 ? toReturn : toReturn[0]; - } + return toReturn.length > 1 ? toReturn : toReturn[0]; + } } /** * @typedef {Object} DepthEstimationPipelineOutput * @property {Tensor} predicted_depth The raw depth map predicted by the model. * @property {RawImage} depth The processed depth map as an image (with the same size as the input image). - * + * * @callback DepthEstimationPipelineCallback Predicts the depth for the image(s) passed as inputs. * @param {ImagePipelineInputs} images The images to compute depth for. * @returns {Promise} An image or a list of images containing result(s). - * + * * @typedef {ImagePipelineConstructorArgs & DepthEstimationPipelineCallback & Disposable} DepthEstimationPipelineType */ /** * Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image. 
- * + * * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas` * ```javascript * const depth_estimator = await pipeline('depth-estimation', 'Xenova/dpt-hybrid-midas'); @@ -2647,323 +2751,332 @@ export class ImageToImagePipeline extends (/** @type {new (options: ImagePipelin * // } * ``` */ -export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineType} */ (Pipeline)) { - /** - * Create a new DepthEstimationPipeline. - * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. - */ - constructor(options) { - super(options); +export class DepthEstimationPipeline + extends /** @type {new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineType} */ ( + Pipeline + ) +{ + /** + * Create a new DepthEstimationPipeline. + * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline. + */ + constructor(options) { + super(options); + } + + /** @type {DepthEstimationPipelineCallback} */ + async _call(images) { + const preparedImages = await prepareImages(images); + + const inputs = await this.processor(preparedImages); + const { predicted_depth } = await this.model(inputs); + + const toReturn = []; + for (let i = 0; i < preparedImages.length; ++i) { + const prediction = interpolate( + predicted_depth[i], + preparedImages[i].size.reverse(), + "bilinear", + false, + ); + const formatted = prediction + .mul_(255 / max(prediction.data)[0]) + .to("uint8"); + toReturn.push({ + predicted_depth: predicted_depth[i], + // depth: RawImage.fromTensor(formatted), + }); } - /** @type {DepthEstimationPipelineCallback} */ - async _call(images) { - - const preparedImages = await prepareImages(images); - - const inputs = await this.processor(preparedImages); - const { predicted_depth } = await this.model(inputs); - - const toReturn = []; - for (let i = 0; i < preparedImages.length; ++i) { - const prediction = interpolate(predicted_depth[i], 
preparedImages[i].size.reverse(), 'bilinear', false); - const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8'); - toReturn.push({ - predicted_depth: predicted_depth[i], - // depth: RawImage.fromTensor(formatted), - }); - } - - return toReturn.length > 1 ? toReturn : toReturn[0]; - } + return toReturn.length > 1 ? toReturn : toReturn[0]; + } } const SUPPORTED_TASKS = Object.freeze({ - "text-classification": { - "tokenizer": AutoTokenizer, - "pipeline": TextClassificationPipeline, - "model": AutoModelForSequenceClassification, - "default": { - // TODO: replace with original - // "model": "distilbert-base-uncased-finetuned-sst-2-english", - "model": "Xenova/distilbert-base-uncased-finetuned-sst-2-english", - }, - "type": "text", + "text-classification": { + tokenizer: AutoTokenizer, + pipeline: TextClassificationPipeline, + model: AutoModelForSequenceClassification, + default: { + // TODO: replace with original + // "model": "distilbert-base-uncased-finetuned-sst-2-english", + model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english", }, - "token-classification": { - "tokenizer": AutoTokenizer, - "pipeline": TokenClassificationPipeline, - "model": AutoModelForTokenClassification, - "default": { - // TODO: replace with original - // "model": "Davlan/bert-base-multilingual-cased-ner-hrl", - "model": "Xenova/bert-base-multilingual-cased-ner-hrl", - }, - "type": "text", + type: "text", + }, + "token-classification": { + tokenizer: AutoTokenizer, + pipeline: TokenClassificationPipeline, + model: AutoModelForTokenClassification, + default: { + // TODO: replace with original + // "model": "Davlan/bert-base-multilingual-cased-ner-hrl", + model: "Xenova/bert-base-multilingual-cased-ner-hrl", }, - "question-answering": { - "tokenizer": AutoTokenizer, - "pipeline": QuestionAnsweringPipeline, - "model": AutoModelForQuestionAnswering, - "default": { - // TODO: replace with original - // "model": "distilbert-base-cased-distilled-squad", - "model": 
"Xenova/distilbert-base-cased-distilled-squad", - }, - "type": "text", + type: "text", + }, + "question-answering": { + tokenizer: AutoTokenizer, + pipeline: QuestionAnsweringPipeline, + model: AutoModelForQuestionAnswering, + default: { + // TODO: replace with original + // "model": "distilbert-base-cased-distilled-squad", + model: "Xenova/distilbert-base-cased-distilled-squad", }, + type: "text", + }, - "fill-mask": { - "tokenizer": AutoTokenizer, - "pipeline": FillMaskPipeline, - "model": AutoModelForMaskedLM, - "default": { - // TODO: replace with original - // "model": "bert-base-uncased", - "model": "Xenova/bert-base-uncased", - }, - "type": "text", + "fill-mask": { + tokenizer: AutoTokenizer, + pipeline: FillMaskPipeline, + model: AutoModelForMaskedLM, + default: { + // TODO: replace with original + // "model": "bert-base-uncased", + model: "Xenova/bert-base-uncased", }, - "summarization": { - "tokenizer": AutoTokenizer, - "pipeline": SummarizationPipeline, - "model": AutoModelForSeq2SeqLM, - "default": { - // TODO: replace with original - // "model": "sshleifer/distilbart-cnn-6-6", - "model": "Xenova/distilbart-cnn-6-6", - }, - "type": "text", + type: "text", + }, + summarization: { + tokenizer: AutoTokenizer, + pipeline: SummarizationPipeline, + model: AutoModelForSeq2SeqLM, + default: { + // TODO: replace with original + // "model": "sshleifer/distilbart-cnn-6-6", + model: "Xenova/distilbart-cnn-6-6", }, - "translation": { - "tokenizer": AutoTokenizer, - "pipeline": TranslationPipeline, - "model": AutoModelForSeq2SeqLM, - "default": { - // TODO: replace with original - // "model": "t5-small", - "model": "Xenova/t5-small", - }, - "type": "text", + type: "text", + }, + translation: { + tokenizer: AutoTokenizer, + pipeline: TranslationPipeline, + model: AutoModelForSeq2SeqLM, + default: { + // TODO: replace with original + // "model": "t5-small", + model: "Xenova/t5-small", }, - "text2text-generation": { - "tokenizer": AutoTokenizer, - "pipeline": 
Text2TextGenerationPipeline, - "model": AutoModelForSeq2SeqLM, - "default": { - // TODO: replace with original - // "model": "google/flan-t5-small", - "model": "Xenova/flan-t5-small", - }, - "type": "text", + type: "text", + }, + "text2text-generation": { + tokenizer: AutoTokenizer, + pipeline: Text2TextGenerationPipeline, + model: AutoModelForSeq2SeqLM, + default: { + // TODO: replace with original + // "model": "google/flan-t5-small", + model: "Xenova/flan-t5-small", }, - "text-generation": { - "tokenizer": AutoTokenizer, - "pipeline": TextGenerationPipeline, - "model": AutoModelForCausalLM, - "default": { - // TODO: replace with original - // "model": "gpt2", - "model": "Xenova/gpt2", - }, - "type": "text", + type: "text", + }, + "text-generation": { + tokenizer: AutoTokenizer, + pipeline: TextGenerationPipeline, + model: AutoModelForCausalLM, + default: { + // TODO: replace with original + // "model": "gpt2", + model: "Xenova/gpt2", }, - "zero-shot-classification": { - "tokenizer": AutoTokenizer, - "pipeline": ZeroShotClassificationPipeline, - "model": AutoModelForSequenceClassification, - "default": { - // TODO: replace with original - // "model": "typeform/distilbert-base-uncased-mnli", - "model": "Xenova/distilbert-base-uncased-mnli", - }, - "type": "text", + type: "text", + }, + "zero-shot-classification": { + tokenizer: AutoTokenizer, + pipeline: ZeroShotClassificationPipeline, + model: AutoModelForSequenceClassification, + default: { + // TODO: replace with original + // "model": "typeform/distilbert-base-uncased-mnli", + model: "Xenova/distilbert-base-uncased-mnli", }, - "audio-classification": { - "pipeline": AudioClassificationPipeline, - "model": AutoModelForAudioClassification, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "superb/wav2vec2-base-superb-ks", - "model": "Xenova/wav2vec2-base-superb-ks", - }, - "type": "audio", + type: "text", + }, + "audio-classification": { + pipeline: 
AudioClassificationPipeline, + model: AutoModelForAudioClassification, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "superb/wav2vec2-base-superb-ks", + model: "Xenova/wav2vec2-base-superb-ks", }, - "zero-shot-audio-classification": { - "tokenizer": AutoTokenizer, - "pipeline": ZeroShotAudioClassificationPipeline, - "model": AutoModel, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "laion/clap-htsat-fused", - "model": "Xenova/clap-htsat-unfused", - }, - "type": "multimodal", + type: "audio", + }, + "zero-shot-audio-classification": { + tokenizer: AutoTokenizer, + pipeline: ZeroShotAudioClassificationPipeline, + model: AutoModel, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "laion/clap-htsat-fused", + model: "Xenova/clap-htsat-unfused", }, - "automatic-speech-recognition": { - "tokenizer": AutoTokenizer, - "pipeline": AutomaticSpeechRecognitionPipeline, - "model": [AutoModelForSpeechSeq2Seq, AutoModelForCTC], - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "openai/whisper-tiny.en", - "model": "Xenova/whisper-tiny.en", - }, - "type": "multimodal", + type: "multimodal", + }, + "automatic-speech-recognition": { + tokenizer: AutoTokenizer, + pipeline: AutomaticSpeechRecognitionPipeline, + model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC], + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "openai/whisper-tiny.en", + model: "Xenova/whisper-tiny.en", }, - "text-to-audio": { - "tokenizer": AutoTokenizer, - "pipeline": TextToAudioPipeline, - "model": [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram], - "processor": [AutoProcessor, /* Some don't use a processor */ null], - "default": { - // TODO: replace with original - // "model": "microsoft/speecht5_tts", - "model": "Xenova/speecht5_tts", - }, - "type": "text", + type: "multimodal", + }, + 
"text-to-audio": { + tokenizer: AutoTokenizer, + pipeline: TextToAudioPipeline, + model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram], + processor: [AutoProcessor, /* Some don't use a processor */ null], + default: { + // TODO: replace with original + // "model": "microsoft/speecht5_tts", + model: "Xenova/speecht5_tts", }, - "image-to-text": { - "tokenizer": AutoTokenizer, - "pipeline": ImageToTextPipeline, - "model": AutoModelForVision2Seq, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "nlpconnect/vit-gpt2-image-captioning", - "model": "Xenova/vit-gpt2-image-captioning", - }, - "type": "multimodal", + type: "text", + }, + "image-to-text": { + tokenizer: AutoTokenizer, + pipeline: ImageToTextPipeline, + model: AutoModelForVision2Seq, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "nlpconnect/vit-gpt2-image-captioning", + model: "Xenova/vit-gpt2-image-captioning", }, + type: "multimodal", + }, - "image-classification": { - // no tokenizer - "pipeline": ImageClassificationPipeline, - "model": AutoModelForImageClassification, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "google/vit-base-patch16-224", - "model": "Xenova/vit-base-patch16-224", - }, - "type": "multimodal", + "image-classification": { + // no tokenizer + pipeline: ImageClassificationPipeline, + model: AutoModelForImageClassification, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "google/vit-base-patch16-224", + model: "Xenova/vit-base-patch16-224", }, + type: "multimodal", + }, - "image-segmentation": { - // no tokenizer - "pipeline": ImageSegmentationPipeline, - "model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation], - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "facebook/detr-resnet-50-panoptic", - "model": "Xenova/detr-resnet-50-panoptic", - }, - 
"type": "multimodal", + "image-segmentation": { + // no tokenizer + pipeline: ImageSegmentationPipeline, + model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation], + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "facebook/detr-resnet-50-panoptic", + model: "Xenova/detr-resnet-50-panoptic", }, + type: "multimodal", + }, - "zero-shot-image-classification": { - "tokenizer": AutoTokenizer, - "pipeline": ZeroShotImageClassificationPipeline, - "model": AutoModel, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "openai/clip-vit-base-patch32", - "model": "Xenova/clip-vit-base-patch32", - }, - "type": "multimodal", + "zero-shot-image-classification": { + tokenizer: AutoTokenizer, + pipeline: ZeroShotImageClassificationPipeline, + model: AutoModel, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "openai/clip-vit-base-patch32", + model: "Xenova/clip-vit-base-patch32", }, + type: "multimodal", + }, - "object-detection": { - // no tokenizer - "pipeline": ObjectDetectionPipeline, - "model": AutoModelForObjectDetection, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "facebook/detr-resnet-50", - "model": "Xenova/detr-resnet-50", - }, - "type": "multimodal", + "object-detection": { + // no tokenizer + pipeline: ObjectDetectionPipeline, + model: AutoModelForObjectDetection, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "facebook/detr-resnet-50", + model: "Xenova/detr-resnet-50", }, - "zero-shot-object-detection": { - "tokenizer": AutoTokenizer, - "pipeline": ZeroShotObjectDetectionPipeline, - "model": AutoModelForZeroShotObjectDetection, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "google/owlvit-base-patch32", - "model": "Xenova/owlvit-base-patch32", - }, - "type": "multimodal", + type: "multimodal", + }, + 
"zero-shot-object-detection": { + tokenizer: AutoTokenizer, + pipeline: ZeroShotObjectDetectionPipeline, + model: AutoModelForZeroShotObjectDetection, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "google/owlvit-base-patch32", + model: "Xenova/owlvit-base-patch32", }, - "document-question-answering": { - "tokenizer": AutoTokenizer, - "pipeline": DocumentQuestionAnsweringPipeline, - "model": AutoModelForDocumentQuestionAnswering, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "naver-clova-ix/donut-base-finetuned-docvqa", - "model": "Xenova/donut-base-finetuned-docvqa", - }, - "type": "multimodal", + type: "multimodal", + }, + "document-question-answering": { + tokenizer: AutoTokenizer, + pipeline: DocumentQuestionAnsweringPipeline, + model: AutoModelForDocumentQuestionAnswering, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "naver-clova-ix/donut-base-finetuned-docvqa", + model: "Xenova/donut-base-finetuned-docvqa", }, - "image-to-image": { - // no tokenizer - "pipeline": ImageToImagePipeline, - "model": AutoModelForImageToImage, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "caidas/swin2SR-classical-sr-x2-64", - "model": "Xenova/swin2SR-classical-sr-x2-64", - }, - "type": "image", + type: "multimodal", + }, + "image-to-image": { + // no tokenizer + pipeline: ImageToImagePipeline, + model: AutoModelForImageToImage, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "caidas/swin2SR-classical-sr-x2-64", + model: "Xenova/swin2SR-classical-sr-x2-64", }, - "depth-estimation": { - // no tokenizer - "pipeline": DepthEstimationPipeline, - "model": AutoModelForDepthEstimation, - "processor": AutoProcessor, - "default": { - // TODO: replace with original - // "model": "Intel/dpt-large", - "model": "Xenova/dpt-large", - }, - "type": "image", + type: "image", + }, + 
"depth-estimation": { + // no tokenizer + pipeline: DepthEstimationPipeline, + model: AutoModelForDepthEstimation, + processor: AutoProcessor, + default: { + // TODO: replace with original + // "model": "Intel/dpt-large", + model: "Xenova/dpt-large", }, + type: "image", + }, - // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers). - "feature-extraction": { - "tokenizer": AutoTokenizer, - "pipeline": FeatureExtractionPipeline, - "model": AutoModel, - "default": { - // TODO: replace with original - // "model": "sentence-transformers/all-MiniLM-L6-v2", - "model": "Xenova/all-MiniLM-L6-v2", - }, - "type": "text", + // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers). + "feature-extraction": { + tokenizer: AutoTokenizer, + pipeline: FeatureExtractionPipeline, + model: AutoModel, + default: { + // TODO: replace with original + // "model": "sentence-transformers/all-MiniLM-L6-v2", + model: "Xenova/all-MiniLM-L6-v2", }, -}) - + type: "text", + }, +}); // TODO: Add types for TASK_ALIASES const TASK_ALIASES = Object.freeze({ - "sentiment-analysis": "text-classification", - "ner": "token-classification", - // "vqa": "visual-question-answering", // TODO: Add - "asr": "automatic-speech-recognition", - "text-to-speech": "text-to-audio", + "sentiment-analysis": "text-classification", + ner: "token-classification", + // "vqa": "visual-question-answering", // TODO: Add + asr: "automatic-speech-recognition", + "text-to-speech": "text-to-audio", - // Add for backwards compatibility - "embeddings": "feature-extraction", + // Add for backwards compatibility + embeddings: "feature-extraction", }); /** @@ -2977,7 +3090,7 @@ const TASK_ALIASES = Object.freeze({ /** * Utility factory method to build a `Pipeline` object. - * + * * @template {PipelineType} T The type of pipeline to return. 
* @param {T} task The task defining which pipeline will be returned. Currently accepted tasks are: * - `"audio-classification"`: will return a `AudioClassificationPipeline`. @@ -3008,65 +3121,66 @@ const TASK_ALIASES = Object.freeze({ * @throws {Error} If an unsupported pipeline is requested. */ export async function pipeline( - task, - model = null, - { - quantized = true, - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - } = {} + task, + model = null, + { + quantized = true, + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + } = {}, ) { - // Helper method to construct pipeline + // Helper method to construct pipeline - // Apply aliases - // @ts-ignore - task = TASK_ALIASES[task] ?? task; + // Apply aliases + // @ts-ignore + task = TASK_ALIASES[task] ?? task; - // Get pipeline info - const pipelineInfo = SUPPORTED_TASKS[task.split('_', 1)[0]]; - if (!pipelineInfo) { - throw Error(`Unsupported pipeline: ${task}. Must be one of [${Object.keys(SUPPORTED_TASKS)}]`) - } + // Get pipeline info + const pipelineInfo = SUPPORTED_TASKS[task.split("_", 1)[0]]; + if (!pipelineInfo) { + throw Error( + `Unsupported pipeline: ${task}. Must be one of [${Object.keys(SUPPORTED_TASKS)}]`, + ); + } - // Use model if specified, otherwise, use default - if (!model) { - model = pipelineInfo.default.model - console.log(`No model specified. Using default model: "${model}".`); - } + // Use model if specified, otherwise, use default + if (!model) { + model = pipelineInfo.default.model; + console.log(`No model specified. 
Using default model: "${model}".`); + } - const pretrainedOptions = { - quantized, - progress_callback, - config, - cache_dir, - local_files_only, - revision, - } + const pretrainedOptions = { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }; - const classes = new Map([ - ['tokenizer', pipelineInfo.tokenizer], - ['model', pipelineInfo.model], - ['processor', pipelineInfo.processor], - ]); + const classes = new Map([ + ["tokenizer", pipelineInfo.tokenizer], + ["model", pipelineInfo.model], + ["processor", pipelineInfo.processor], + ]); - // Load model, tokenizer, and processor (if they exist) - const results = await loadItems(classes, model, pretrainedOptions); - results.task = task; + // Load model, tokenizer, and processor (if they exist) + const results = await loadItems(classes, model, pretrainedOptions); + results.task = task; - dispatchCallback(progress_callback, { - 'status': 'ready', - 'task': task, - 'model': model, - }); + dispatchCallback(progress_callback, { + status: "ready", + task: task, + model: model, + }); - const pipelineClass = pipelineInfo.pipeline; - return new pipelineClass(results); + const pipelineClass = pipelineInfo.pipeline; + return new pipelineClass(results); } - /** * Helper function to get applicable model, tokenizer, or processor classes for a given model. * @param {Map} mapping The mapping of names to classes, arrays of classes, or null. 
@@ -3075,50 +3189,49 @@ export async function pipeline( * @private */ async function loadItems(mapping, model, pretrainedOptions) { + const result = Object.create(null); - const result = Object.create(null); + /**@type {Promise[]} */ + const promises = []; + for (let [name, cls] of mapping.entries()) { + if (!cls) continue; - /**@type {Promise[]} */ - const promises = []; - for (let [name, cls] of mapping.entries()) { - if (!cls) continue; - - /**@type {Promise} */ - let promise; - if (Array.isArray(cls)) { - promise = new Promise(async (resolve, reject) => { - let e; - for (let c of cls) { - if (c === null) { - // If null, we resolve it immediately, meaning the relevant - // class was not found, but it is optional. - resolve(null); - return; - } - try { - resolve(await c.from_pretrained(model, pretrainedOptions)); - return; - } catch (err) { - e = err; - } - } - reject(e); - }) - } else { - promise = cls.from_pretrained(model, pretrainedOptions); + /**@type {Promise} */ + let promise; + if (Array.isArray(cls)) { + promise = new Promise(async (resolve, reject) => { + let e; + for (let c of cls) { + if (c === null) { + // If null, we resolve it immediately, meaning the relevant + // class was not found, but it is optional. 
+ resolve(null); + return; + } + try { + resolve(await c.from_pretrained(model, pretrainedOptions)); + return; + } catch (err) { + e = err; + } } - - result[name] = promise; - promises.push(promise); + reject(e); + }); + } else { + promise = cls.from_pretrained(model, pretrainedOptions); } - // Wait for all promises to resolve (in parallel) - await Promise.all(promises); + result[name] = promise; + promises.push(promise); + } - // Then assign to result - for (let [name, promise] of Object.entries(result)) { - result[name] = await promise; - } + // Wait for all promises to resolve (in parallel) + await Promise.all(promises); - return result; -} \ No newline at end of file + // Then assign to result + for (let [name, promise] of Object.entries(result)) { + result[name] = await promise; + } + + return result; +} diff --git a/core/vendor/modules/@xenova/transformers/src/processors.js b/core/vendor/modules/@xenova/transformers/src/processors.js index 37e70c197..1bdaf807b 100644 --- a/core/vendor/modules/@xenova/transformers/src/processors.js +++ b/core/vendor/modules/@xenova/transformers/src/processors.js @@ -1,7 +1,6 @@ - /** * @file Processors are used to prepare non-textual inputs (e.g., image or audio) for a model. - * + * * **Example:** Using a `WhisperProcessor` to prepare an audio input for a model. 
* ```javascript * import { AutoProcessor, read_audio } from '@xenova/transformers'; @@ -16,51 +15,43 @@ * // size: 240000, * // } * ``` - * + * * @module processors */ import { - Callable, - calculateDimensions, - calculateReflectOffset, -} from './utils/core.js'; + Callable, + calculateDimensions, + calculateReflectOffset, +} from "./utils/core.js"; -import { - getModelJSON, -} from './utils/hub.js'; +import { getModelJSON } from "./utils/hub.js"; -import { - max, - min, - softmax, -} from './utils/maths.js'; +import { max, min, softmax } from "./utils/maths.js"; - -import { Tensor, cat, interpolate, stack, transpose } from './utils/tensor.js'; +import { Tensor, cat, interpolate, stack, transpose } from "./utils/tensor.js"; // import { RawImage } from './utils/image.js'; import { - mel_filter_bank, - spectrogram, - window_function, -} from './utils/audio.js'; - + mel_filter_bank, + spectrogram, + window_function, +} from "./utils/audio.js"; // Helper functions /** * Converts bounding boxes from center format to corners format. - * + * * @param {number[]} arr The coordinate for the center of the box and its width, height dimensions (center_x, center_y, width, height) * @returns {number[]} The coodinates for the top-left and bottom-right corners of the box (top_left_x, top_left_y, bottom_right_x, bottom_right_y) */ function center_to_corners_format([centerX, centerY, width, height]) { - return [ - centerX - width / 2, - centerY - height / 2, - centerX + width / 2, - centerY + height / 2 - ]; + return [ + centerX - width / 2, + centerY - height / 2, + centerX + width / 2, + centerY + height / 2, + ]; } /** @@ -74,73 +65,78 @@ function center_to_corners_format([centerX, centerY, width, height]) { * @return {Object[]} An array of objects containing the post-processed outputs. 
* @private */ -function post_process_object_detection(outputs, threshold = 0.5, target_sizes = null, is_zero_shot = false) { - const out_logits = outputs.logits; - const out_bbox = outputs.pred_boxes; - const [batch_size, num_boxes, num_classes] = out_logits.dims; +function post_process_object_detection( + outputs, + threshold = 0.5, + target_sizes = null, + is_zero_shot = false, +) { + const out_logits = outputs.logits; + const out_bbox = outputs.pred_boxes; + const [batch_size, num_boxes, num_classes] = out_logits.dims; - if (target_sizes !== null && target_sizes.length !== batch_size) { - throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits") - } - let toReturn = []; - for (let i = 0; i < batch_size; ++i) { - let target_size = target_sizes !== null ? target_sizes[i] : null; - let info = { - boxes: [], - classes: [], - scores: [] + if (target_sizes !== null && target_sizes.length !== batch_size) { + throw Error( + "Make sure that you pass in as many target sizes as the batch dimension of the logits", + ); + } + let toReturn = []; + for (let i = 0; i < batch_size; ++i) { + let target_size = target_sizes !== null ? 
target_sizes[i] : null; + let info = { + boxes: [], + classes: [], + scores: [], + }; + let logits = out_logits[i]; + let bbox = out_bbox[i]; + + for (let j = 0; j < num_boxes; ++j) { + let logit = logits[j]; + + let indices = []; + let probs; + if (is_zero_shot) { + // Get indices of classes with high enough probability + probs = logit.sigmoid().data; + for (let k = 0; k < probs.length; ++k) { + if (probs[k] > threshold) { + indices.push(k); + } } - let logits = out_logits[i]; - let bbox = out_bbox[i]; + } else { + // Get most probable class + let maxIndex = max(logit.data)[1]; - for (let j = 0; j < num_boxes; ++j) { - let logit = logits[j]; - - let indices = []; - let probs; - if (is_zero_shot) { - // Get indices of classes with high enough probability - probs = logit.sigmoid().data; - for (let k = 0; k < probs.length; ++k) { - if (probs[k] > threshold) { - indices.push(k); - } - } - - } else { - // Get most probable class - let maxIndex = max(logit.data)[1]; - - if (maxIndex === num_classes - 1) { - // This is the background class, skip it - continue; - } - indices.push(maxIndex); - - // Compute softmax over classes - probs = softmax(logit.data); - } - - for (const index of indices) { - - // Some class has a high enough probability - /** @type {number[]} */ - let box = bbox[j].data; - - // convert to [x0, y0, x1, y1] format - box = center_to_corners_format(box) - if (target_size !== null) { - box = box.map((x, i) => x * target_size[(i + 1) % 2]) - } - - info.boxes.push(box); - info.classes.push(index); - info.scores.push(probs[index]); - } + if (maxIndex === num_classes - 1) { + // This is the background class, skip it + continue; } - toReturn.push(info); + indices.push(maxIndex); + + // Compute softmax over classes + probs = softmax(logit.data); + } + + for (const index of indices) { + // Some class has a high enough probability + /** @type {number[]} */ + let box = bbox[j].data; + + // convert to [x0, y0, x1, y1] format + box = center_to_corners_format(box); + 
if (target_size !== null) { + box = box.map((x, i) => x * target_size[(i + 1) % 2]); + } + + info.boxes.push(box); + info.classes.push(index); + info.scores.push(probs[index]); + } } - return toReturn; + toReturn.push(info); + } + return toReturn; } /** @@ -156,12 +152,12 @@ function post_process_object_detection(outputs, threshold = 0.5, target_sizes = * @private */ function validate_audio_inputs(audio, feature_extractor) { - if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { - throw new Error( - `${feature_extractor} expects input to be a Float32Array or a Float64Array, but got ${audio?.constructor?.name ?? typeof audio} instead.` + - `If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.` - ) - } + if (!(audio instanceof Float32Array || audio instanceof Float64Array)) { + throw new Error( + `${feature_extractor} expects input to be a Float32Array or a Float64Array, but got ${audio?.constructor?.name ?? typeof audio} instead.` + + `If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`, + ); + } } /** @@ -170,15 +166,15 @@ function validate_audio_inputs(audio, feature_extractor) { * @extends Callable */ export class FeatureExtractor extends Callable { - /** - * Constructs a new FeatureExtractor instance. - * - * @param {Object} config The configuration for the feature extractor. - */ - constructor(config) { - super(); - this.config = config - } + /** + * Constructs a new FeatureExtractor instance. + * + * @param {Object} config The configuration for the feature extractor. + */ + constructor(config) { + super(); + this.config = config; + } } /** @@ -194,597 +190,638 @@ export class FeatureExtractor extends Callable { * @extends FeatureExtractor */ export class ImageFeatureExtractor extends FeatureExtractor { + /** + * Constructs a new ImageFeatureExtractor instance. 
+ * + * @param {Object} config The configuration for the feature extractor. + * @param {number[]} config.image_mean The mean values for image normalization. + * @param {number[]} config.image_std The standard deviation values for image normalization. + * @param {boolean} config.do_rescale Whether to rescale the image pixel values to the [0,1] range. + * @param {number} config.rescale_factor The factor to use for rescaling the image pixel values. + * @param {boolean} config.do_normalize Whether to normalize the image pixel values. + * @param {boolean} config.do_resize Whether to resize the image. + * @param {number} config.resample What method to use for resampling. + * @param {number} config.size The size to resize the image to. + */ + constructor(config) { + super(config); - /** - * Constructs a new ImageFeatureExtractor instance. - * - * @param {Object} config The configuration for the feature extractor. - * @param {number[]} config.image_mean The mean values for image normalization. - * @param {number[]} config.image_std The standard deviation values for image normalization. - * @param {boolean} config.do_rescale Whether to rescale the image pixel values to the [0,1] range. - * @param {number} config.rescale_factor The factor to use for rescaling the image pixel values. - * @param {boolean} config.do_normalize Whether to normalize the image pixel values. - * @param {boolean} config.do_resize Whether to resize the image. - * @param {number} config.resample What method to use for resampling. - * @param {number} config.size The size to resize the image to. - */ - constructor(config) { - super(config); + this.image_mean = this.config.image_mean ?? this.config.mean; + this.image_std = this.config.image_std ?? this.config.std; - this.image_mean = this.config.image_mean ?? this.config.mean; - this.image_std = this.config.image_std ?? this.config.std; + this.resample = this.config.resample ?? 2; // 2 => bilinear + this.do_rescale = this.config.do_rescale ?? 
true; + this.rescale_factor = this.config.rescale_factor ?? 1 / 255; + this.do_normalize = this.config.do_normalize; - this.resample = this.config.resample ?? 2; // 2 => bilinear - this.do_rescale = this.config.do_rescale ?? true; - this.rescale_factor = this.config.rescale_factor ?? (1 / 255); - this.do_normalize = this.config.do_normalize; + this.do_resize = this.config.do_resize; + this.do_thumbnail = this.config.do_thumbnail; + this.size = this.config.size; + this.size_divisibility = + this.config.size_divisibility ?? this.config.size_divisor; - this.do_resize = this.config.do_resize; - this.do_thumbnail = this.config.do_thumbnail; - this.size = this.config.size; - this.size_divisibility = this.config.size_divisibility ?? this.config.size_divisor; + this.do_center_crop = this.config.do_center_crop; + this.crop_size = this.config.crop_size; + this.do_convert_rgb = this.config.do_convert_rgb ?? true; + this.do_crop_margin = this.config.do_crop_margin; - this.do_center_crop = this.config.do_center_crop; - this.crop_size = this.config.crop_size; - this.do_convert_rgb = this.config.do_convert_rgb ?? true; - this.do_crop_margin = this.config.do_crop_margin; + this.pad_size = this.config.pad_size; + this.do_pad = this.config.do_pad; - this.pad_size = this.config.pad_size; - this.do_pad = this.config.do_pad; + if ( + this.do_pad && + !this.pad_size && + this.size && + this.size.width !== undefined && + this.size.height !== undefined + ) { + // Should pad, but no pad size specified + // We infer the pad size from the resize size + this.pad_size = this.size; + } + } - if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) { - // Should pad, but no pad size specified - // We infer the pad size from the resize size - this.pad_size = this.size - } + /** + * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any + * corresponding dimension of the specified size. 
+ * @param {RawImage} image The image to be resized. + * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to. + * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use. + * @returns {Promise} The resized image. + */ + async thumbnail(image, size, resample = 2) { + const input_height = image.height; + const input_width = image.width; + + const output_height = size.height; + const output_width = size.width; + + // We always resize to the smallest of either the input or output size. + let height = Math.min(input_height, output_height); + let width = Math.min(input_width, output_width); + + if (height === input_height && width === input_width) { + return image; + } + if (input_height > input_width) { + width = Math.floor((input_width * height) / input_height); + } else if (input_width > input_height) { + height = Math.floor((input_height * width) / input_width); + } + return await image.resize(width, height, { resample }); + } + + /** + * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold). + * @param {RawImage} image The image to be cropped. + * @param {number} gray_threshold Value below which pixels are considered to be gray. + * @returns {Promise} The cropped image. + */ + async crop_margin(image, gray_threshold = 200) { + const gray_image = image.clone().grayscale(); + + const minValue = min(gray_image.data)[0]; + const maxValue = max(gray_image.data)[0]; + const diff = maxValue - minValue; + + if (diff === 0) { + return image; } - /** - * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any - * corresponding dimension of the specified size. - * @param {RawImage} image The image to be resized. - * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to. 
- * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use. - * @returns {Promise} The resized image. - */ - async thumbnail(image, size, resample = 2) { - const input_height = image.height; - const input_width = image.width; + const threshold = gray_threshold / 255; - const output_height = size.height; - const output_width = size.width; - - // We always resize to the smallest of either the input or output size. - let height = Math.min(input_height, output_height) - let width = Math.min(input_width, output_width) - - if (height === input_height && width === input_width) { - return image; + let x_min = gray_image.width, + y_min = gray_image.height, + x_max = 0, + y_max = 0; + for (let j = 0; j < gray_image.height; ++j) { + const row = j * gray_image.width; + for (let i = 0; i < gray_image.width; ++i) { + if ((gray_image.data[row + i] - minValue) / diff < threshold) { + // We have a non-zero pixel, so we update the min/max values accordingly + x_min = Math.min(x_min, i); + y_min = Math.min(y_min, j); + x_max = Math.max(x_max, i); + y_max = Math.max(y_max, j); } - if (input_height > input_width) { - width = Math.floor(input_width * height / input_height); - } else if (input_width > input_height) { - height = Math.floor(input_height * width / input_width); - } - return await image.resize(width, height, { resample }); + } } + image = await image.crop([x_min, y_min, x_max, y_max]); + return image; + } - /** - * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold). - * @param {RawImage} image The image to be cropped. - * @param {number} gray_threshold Value below which pixels are considered to be gray. - * @returns {Promise} The cropped image. - */ - async crop_margin(image, gray_threshold = 200) { + /** + * Pad the image by a certain amount. + * @param {Float32Array} pixelData The pixel data to pad. + * @param {number[]} imgDims The dimensions of the image. 
+ * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. + * @param {Object} options The options for padding. + * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. + * @param {boolean} [options.center=false] Whether to center the image. + * @param {number} [options.constant_values=0] The constant value to use for padding. + * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions. + */ + pad_image( + pixelData, + imgDims, + padSize, + { mode = "constant", center = false, constant_values = 0 } = {}, + ) { + const [imageWidth, imageHeight, imageChannels] = imgDims; - const gray_image = image.clone().grayscale(); - - const minValue = min(gray_image.data)[0]; - const maxValue = max(gray_image.data)[0]; - const diff = maxValue - minValue; - - if (diff === 0) { - return image; - } - - const threshold = gray_threshold / 255; - - let x_min = gray_image.width, y_min = gray_image.height, x_max = 0, y_max = 0; - for (let j = 0; j < gray_image.height; ++j) { - const row = j * gray_image.width; - for (let i = 0; i < gray_image.width; ++i) { - if ((gray_image.data[row + i] - minValue) / diff < threshold) { - // We have a non-zero pixel, so we update the min/max values accordingly - x_min = Math.min(x_min, i); - y_min = Math.min(y_min, j); - x_max = Math.max(x_max, i); - y_max = Math.max(y_max, j); - } - } - } - - image = await image.crop([x_min, y_min, x_max, y_max]); - return image; + let paddedImageWidth, paddedImageHeight; + if (typeof padSize === "number") { + paddedImageWidth = padSize; + paddedImageHeight = padSize; + } else { + paddedImageWidth = padSize.width; + paddedImageHeight = padSize.height; } - /** - * Pad the image by a certain amount. - * @param {Float32Array} pixelData The pixel data to pad. - * @param {number[]} imgDims The dimensions of the image. - * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. 
- * @param {Object} options The options for padding. - * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. - * @param {boolean} [options.center=false] Whether to center the image. - * @param {number} [options.constant_values=0] The constant value to use for padding. - * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions. - */ - pad_image(pixelData, imgDims, padSize, { - mode = 'constant', - center = false, - constant_values = 0, - } = {}) { - const [imageWidth, imageHeight, imageChannels] = imgDims; - - let paddedImageWidth, paddedImageHeight; - if (typeof padSize === 'number') { - paddedImageWidth = padSize; - paddedImageHeight = padSize; - } else { - paddedImageWidth = padSize.width; - paddedImageHeight = padSize.height; + // Only add padding if there is a difference in size + if (paddedImageWidth !== imageWidth || paddedImageHeight !== imageHeight) { + const paddedPixelData = new Float32Array( + paddedImageWidth * paddedImageHeight * imageChannels, + ); + if (Array.isArray(constant_values)) { + // Fill with constant values, cycling through the array + for (let i = 0; i < paddedPixelData.length; ++i) { + paddedPixelData[i] = constant_values[i % imageChannels]; } + } else if (constant_values !== 0) { + paddedPixelData.fill(constant_values); + } - // Only add padding if there is a difference in size - if (paddedImageWidth !== imageWidth || paddedImageHeight !== imageHeight) { - const paddedPixelData = new Float32Array(paddedImageWidth * paddedImageHeight * imageChannels); - if (Array.isArray(constant_values)) { - // Fill with constant values, cycling through the array - for (let i = 0; i < paddedPixelData.length; ++i) { - paddedPixelData[i] = constant_values[i % imageChannels]; - } - } else if (constant_values !== 0) { - paddedPixelData.fill(constant_values); - } + const [left, top] = center + ? 
[ + Math.floor((paddedImageWidth - imageWidth) / 2), + Math.floor((paddedImageHeight - imageHeight) / 2), + ] + : [0, 0]; - const [left, top] = center - ? [Math.floor((paddedImageWidth - imageWidth) / 2), Math.floor((paddedImageHeight - imageHeight) / 2)] - : [0, 0]; - - // Copy the original image into the padded image - for (let i = 0; i < imageHeight; ++i) { - const a = (i + top) * paddedImageWidth; - const b = i * imageWidth; - for (let j = 0; j < imageWidth; ++j) { - const c = (a + j + left) * imageChannels; - const d = (b + j) * imageChannels; - for (let k = 0; k < imageChannels; ++k) { - paddedPixelData[c + k] = pixelData[d + k]; - } - } - } - - if (mode === 'symmetric') { - if (center) { - throw new Error('`center` padding is not supported when `mode` is set to `symmetric`.'); - // TODO: Implement this - } - const h1 = imageHeight - 1; - const w1 = imageWidth - 1; - for (let i = 0; i < paddedImageHeight; ++i) { - const a = i * paddedImageWidth; - const b = calculateReflectOffset(i, h1) * imageWidth; - - for (let j = 0; j < paddedImageWidth; ++j) { - if (i < imageHeight && j < imageWidth) continue; // Do not overwrite original image - const c = (a + j) * imageChannels; - const d = (b + calculateReflectOffset(j, w1)) * imageChannels; - - // Copy channel-wise - for (let k = 0; k < imageChannels; ++k) { - paddedPixelData[c + k] = pixelData[d + k]; - } - } - } - } - - - // Update pixel data and image dimensions - pixelData = paddedPixelData; - imgDims = [paddedImageHeight, paddedImageWidth, imageChannels] + // Copy the original image into the padded image + for (let i = 0; i < imageHeight; ++i) { + const a = (i + top) * paddedImageWidth; + const b = i * imageWidth; + for (let j = 0; j < imageWidth; ++j) { + const c = (a + j + left) * imageChannels; + const d = (b + j) * imageChannels; + for (let k = 0; k < imageChannels; ++k) { + paddedPixelData[c + k] = pixelData[d + k]; + } } - return [pixelData, imgDims]; + } + + if (mode === "symmetric") { + if (center) { + 
throw new Error( + "`center` padding is not supported when `mode` is set to `symmetric`.", + ); + // TODO: Implement this + } + const h1 = imageHeight - 1; + const w1 = imageWidth - 1; + for (let i = 0; i < paddedImageHeight; ++i) { + const a = i * paddedImageWidth; + const b = calculateReflectOffset(i, h1) * imageWidth; + + for (let j = 0; j < paddedImageWidth; ++j) { + if (i < imageHeight && j < imageWidth) continue; // Do not overwrite original image + const c = (a + j) * imageChannels; + const d = (b + calculateReflectOffset(j, w1)) * imageChannels; + + // Copy channel-wise + for (let k = 0; k < imageChannels; ++k) { + paddedPixelData[c + k] = pixelData[d + k]; + } + } + } + } + + // Update pixel data and image dimensions + pixelData = paddedPixelData; + imgDims = [paddedImageHeight, paddedImageWidth, imageChannels]; + } + return [pixelData, imgDims]; + } + + /** + * Rescale the image' pixel values by `this.rescale_factor`. + * @param {Float32Array} pixelData The pixel data to rescale. + * @returns {void} + */ + rescale(pixelData) { + for (let i = 0; i < pixelData.length; ++i) { + pixelData[i] = this.rescale_factor * pixelData[i]; + } + } + + /** + * Find the target (width, height) dimension of the output image after + * resizing given the input image and the desired size. + * @param {RawImage} image The image to resize. + * @param {any} size The size to use for resizing the image. + * @returns {[number, number]} The target (width, height) dimension of the output image after resizing. + */ + get_resize_output_image_size(image, size) { + // `size` comes in many forms, so we need to handle them all here: + // 1. 
`size` is an integer, in which case we resize the image to be a square + + const [srcWidth, srcHeight] = image.size; + + let shortest_edge; + let longest_edge; + + if (this.do_thumbnail) { + // NOTE: custom logic for `Donut` models + const { height, width } = size; + shortest_edge = Math.min(height, width); + } + // Support both formats for backwards compatibility + else if (Number.isInteger(size)) { + shortest_edge = size; + longest_edge = this.config.max_size ?? shortest_edge; + } else if (size !== undefined) { + // Extract known properties from `size` + shortest_edge = size.shortest_edge; + longest_edge = size.longest_edge; } - /** - * Rescale the image' pixel values by `this.rescale_factor`. - * @param {Float32Array} pixelData The pixel data to rescale. - * @returns {void} - */ - rescale(pixelData) { - for (let i = 0; i < pixelData.length; ++i) { - pixelData[i] = this.rescale_factor * pixelData[i]; - } + // If `longest_edge` and `shortest_edge` are set, maintain aspect ratio and resize to `shortest_edge` + // while keeping the largest dimension <= `longest_edge` + if (shortest_edge !== undefined || longest_edge !== undefined) { + // http://opensourcehacker.com/2011/12/01/calculate-aspect-ratio-conserving-resize-for-images-in-javascript/ + // Try resize so that shortest edge is `shortest_edge` (target) + const shortResizeFactor = + shortest_edge === undefined + ? 1 // If `shortest_edge` is not set, don't upscale + : Math.max(shortest_edge / srcWidth, shortest_edge / srcHeight); + + const newWidth = srcWidth * shortResizeFactor; + const newHeight = srcHeight * shortResizeFactor; + + // The new width and height might be greater than `longest_edge`, so + // we downscale again to ensure the largest dimension is `longest_edge` + const longResizeFactor = + longest_edge === undefined + ? 
1 // If `longest_edge` is not set, don't downscale + : Math.min(longest_edge / newWidth, longest_edge / newHeight); + + // To avoid certain floating point precision issues, we round to 2 decimal places + const finalWidth = Math.floor( + Number((newWidth * longResizeFactor).toFixed(2)), + ); + const finalHeight = Math.floor( + Number((newHeight * longResizeFactor).toFixed(2)), + ); + + return [finalWidth, finalHeight]; + } else if ( + size !== undefined && + size.width !== undefined && + size.height !== undefined + ) { + // If `width` and `height` are set, resize to those dimensions + return [size.width, size.height]; + } else if (this.size_divisibility !== undefined) { + // Rounds the height and width down to the closest multiple of size_divisibility + const newWidth = + Math.floor(srcWidth / this.size_divisibility) * this.size_divisibility; + const newHeight = + Math.floor(srcHeight / this.size_divisibility) * this.size_divisibility; + return [newWidth, newHeight]; + } else { + throw new Error( + `Could not resize image due to unsupported \`this.size\` option in config: ${JSON.stringify(size)}`, + ); + } + } + + /** + * Resizes the image. + * @param {RawImage} image The image to resize. + * @returns {Promise} The resized image. + */ + async resize(image) { + const [newWidth, newHeight] = this.get_resize_output_image_size( + image, + this.size, + ); + return await image.resize(newWidth, newHeight, { + resample: this.resample, + }); + } + + /** + * @typedef {object} PreprocessedImage + * @property {HeightWidth} original_size The original size of the image. + * @property {HeightWidth} reshaped_input_size The reshaped input size of the image. + * @property {Tensor} pixel_values The pixel values of the preprocessed image. + */ + + /** + * Preprocesses the given image. + * + * @param {RawImage} image The image to preprocess. + * @param {Object} overrides The overrides for the preprocessing options. + * @returns {Promise} The preprocessed image. 
+ */ + async preprocess( + image, + { + do_normalize = null, + do_pad = null, + do_convert_rgb = null, + do_convert_grayscale = null, + } = {}, + ) { + if (this.do_crop_margin) { + // NOTE: Specific to nougat processors. This is done before resizing, + // and can be interpreted as a pre-preprocessing step. + image = await this.crop_margin(image); } - /** - * Find the target (width, height) dimension of the output image after - * resizing given the input image and the desired size. - * @param {RawImage} image The image to resize. - * @param {any} size The size to use for resizing the image. - * @returns {[number, number]} The target (width, height) dimension of the output image after resizing. - */ - get_resize_output_image_size(image, size) { - // `size` comes in many forms, so we need to handle them all here: - // 1. `size` is an integer, in which case we resize the image to be a square + const [srcWidth, srcHeight] = image.size; // original image size - const [srcWidth, srcHeight] = image.size; - - let shortest_edge; - let longest_edge; - - if (this.do_thumbnail) { - // NOTE: custom logic for `Donut` models - const { height, width } = size; - shortest_edge = Math.min(height, width) - } - // Support both formats for backwards compatibility - else if (Number.isInteger(size)) { - shortest_edge = size; - longest_edge = this.config.max_size ?? 
shortest_edge; - - } else if (size !== undefined) { - // Extract known properties from `size` - shortest_edge = size.shortest_edge; - longest_edge = size.longest_edge; - } - - // If `longest_edge` and `shortest_edge` are set, maintain aspect ratio and resize to `shortest_edge` - // while keeping the largest dimension <= `longest_edge` - if (shortest_edge !== undefined || longest_edge !== undefined) { - // http://opensourcehacker.com/2011/12/01/calculate-aspect-ratio-conserving-resize-for-images-in-javascript/ - // Try resize so that shortest edge is `shortest_edge` (target) - const shortResizeFactor = shortest_edge === undefined - ? 1 // If `shortest_edge` is not set, don't upscale - : Math.max(shortest_edge / srcWidth, shortest_edge / srcHeight); - - const newWidth = srcWidth * shortResizeFactor; - const newHeight = srcHeight * shortResizeFactor; - - // The new width and height might be greater than `longest_edge`, so - // we downscale again to ensure the largest dimension is `longest_edge` - const longResizeFactor = longest_edge === undefined - ? 
1 // If `longest_edge` is not set, don't downscale - : Math.min(longest_edge / newWidth, longest_edge / newHeight); - - // To avoid certain floating point precision issues, we round to 2 decimal places - const finalWidth = Math.floor(Number((newWidth * longResizeFactor).toFixed(2))); - const finalHeight = Math.floor(Number((newHeight * longResizeFactor).toFixed(2))); - - return [finalWidth, finalHeight]; - - } else if (size !== undefined && size.width !== undefined && size.height !== undefined) { - // If `width` and `height` are set, resize to those dimensions - return [size.width, size.height]; - - } else if (this.size_divisibility !== undefined) { - // Rounds the height and width down to the closest multiple of size_divisibility - const newWidth = Math.floor(srcWidth / this.size_divisibility) * this.size_divisibility; - const newHeight = Math.floor(srcHeight / this.size_divisibility) * this.size_divisibility; - return [newWidth, newHeight]; - } else { - throw new Error(`Could not resize image due to unsupported \`this.size\` option in config: ${JSON.stringify(size)}`); - } + // Convert image to RGB if specified in config. + if (do_convert_rgb ?? this.do_convert_rgb) { + image = image.rgb(); + } else if (do_convert_grayscale) { + image = image.grayscale(); } - /** - * Resizes the image. - * @param {RawImage} image The image to resize. - * @returns {Promise} The resized image. - */ - async resize(image) { - const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size); - return await image.resize(newWidth, newHeight, { - resample: this.resample, - }); + // TODO: + // For efficiency reasons, it might be best to merge the resize and center crop operations into one. + + // Resize all images + if (this.do_resize) { + image = await this.resize(image); } - /** - * @typedef {object} PreprocessedImage - * @property {HeightWidth} original_size The original size of the image. 
- * @property {HeightWidth} reshaped_input_size The reshaped input size of the image. - * @property {Tensor} pixel_values The pixel values of the preprocessed image. - */ - - /** - * Preprocesses the given image. - * - * @param {RawImage} image The image to preprocess. - * @param {Object} overrides The overrides for the preprocessing options. - * @returns {Promise} The preprocessed image. - */ - async preprocess(image, { - do_normalize = null, - do_pad = null, - do_convert_rgb = null, - do_convert_grayscale = null, - } = {}) { - if (this.do_crop_margin) { - // NOTE: Specific to nougat processors. This is done before resizing, - // and can be interpreted as a pre-preprocessing step. - image = await this.crop_margin(image); - } - - const [srcWidth, srcHeight] = image.size; // original image size - - // Convert image to RGB if specified in config. - if (do_convert_rgb ?? this.do_convert_rgb) { - image = image.rgb(); - } else if (do_convert_grayscale) { - image = image.grayscale(); - } - - // TODO: - // For efficiency reasons, it might be best to merge the resize and center crop operations into one. - - // Resize all images - if (this.do_resize) { - image = await this.resize(image); - } - - // Resize the image using thumbnail method. - if (this.do_thumbnail) { - image = await this.thumbnail(image, this.size, this.resample); - } - - if (this.do_center_crop) { - - let crop_width; - let crop_height; - if (Number.isInteger(this.crop_size)) { - crop_width = this.crop_size; - crop_height = this.crop_size; - } else { - crop_width = this.crop_size.width; - crop_height = this.crop_size.height; - } - - image = await image.center_crop(crop_width, crop_height); - } - - /** @type {HeightWidth} */ - const reshaped_input_size = [image.height, image.width]; - - let pixelData = Float32Array.from(image.data); - let imgDims = [image.height, image.width, image.channels]; - - if (this.do_rescale) { - this.rescale(pixelData); - } - - if (do_normalize ?? 
this.do_normalize) { - let image_mean = this.image_mean; - if (!Array.isArray(this.image_mean)) { - image_mean = new Array(image.channels).fill(image_mean); - } - - let image_std = this.image_std; - if (!Array.isArray(this.image_std)) { - image_std = new Array(image.channels).fill(image_mean); - } - - if (image_mean.length !== image.channels || image_std.length !== image.channels) { - throw new Error(`When set to arrays, the length of \`image_mean\` (${image_mean.length}) and \`image_std\` (${image_std.length}) must match the number of channels in the image (${image.channels}).`); - } - - for (let i = 0; i < pixelData.length; i += image.channels) { - for (let j = 0; j < image.channels; ++j) { - pixelData[i + j] = (pixelData[i + j] - this.image_mean[j]) / this.image_std[j]; - } - } - } - - // do padding after rescaling/normalizing - if (do_pad ?? (this.do_pad && this.pad_size)) { - const padded = this.pad_image(pixelData, [image.width, image.height, image.channels], this.pad_size); - [pixelData, imgDims] = padded; // Update pixel data and image dimensions - } - - // Create HWC tensor - const img = new Tensor('float32', pixelData, imgDims); - - // convert to channel dimension format: - const transposed = transpose(img, [2, 0, 1]); // hwc -> chw - - return { - original_size: [srcHeight, srcWidth], - reshaped_input_size: reshaped_input_size, - pixel_values: transposed, - } + // Resize the image using thumbnail method. + if (this.do_thumbnail) { + image = await this.thumbnail(image, this.size, this.resample); } - /** - * Calls the feature extraction process on an array of images, - * preprocesses each image, and concatenates the resulting - * features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @param {...any} args Additional arguments. - * @returns {Promise} An object containing the concatenated pixel values (and other metadata) of the preprocessed images. 
- */ - async _call(images, ...args) { - if (!Array.isArray(images)) { - images = [images]; - } - /** @type {PreprocessedImage[]} */ - const imageData = await Promise.all(images.map(x => this.preprocess(x))); + if (this.do_center_crop) { + let crop_width; + let crop_height; + if (Number.isInteger(this.crop_size)) { + crop_width = this.crop_size; + crop_height = this.crop_size; + } else { + crop_width = this.crop_size.width; + crop_height = this.crop_size.height; + } - // Stack pixel values - const pixel_values = stack(imageData.map(x => x.pixel_values), 0); - - return { - pixel_values: pixel_values, - - // Original sizes of images - original_sizes: imageData.map(x => x.original_size), - - // Reshaped sizes of images, before padding or cropping - reshaped_input_sizes: imageData.map(x => x.reshaped_input_size), - } + image = await image.center_crop(crop_width, crop_height); } + /** @type {HeightWidth} */ + const reshaped_input_size = [image.height, image.width]; + + let pixelData = Float32Array.from(image.data); + let imgDims = [image.height, image.width, image.channels]; + + if (this.do_rescale) { + this.rescale(pixelData); + } + + if (do_normalize ?? 
this.do_normalize) { + let image_mean = this.image_mean; + if (!Array.isArray(this.image_mean)) { + image_mean = new Array(image.channels).fill(image_mean); + } + + let image_std = this.image_std; + if (!Array.isArray(this.image_std)) { + image_std = new Array(image.channels).fill(image_mean); + } + + if ( + image_mean.length !== image.channels || + image_std.length !== image.channels + ) { + throw new Error( + `When set to arrays, the length of \`image_mean\` (${image_mean.length}) and \`image_std\` (${image_std.length}) must match the number of channels in the image (${image.channels}).`, + ); + } + + for (let i = 0; i < pixelData.length; i += image.channels) { + for (let j = 0; j < image.channels; ++j) { + pixelData[i + j] = + (pixelData[i + j] - this.image_mean[j]) / this.image_std[j]; + } + } + } + + // do padding after rescaling/normalizing + if (do_pad ?? (this.do_pad && this.pad_size)) { + const padded = this.pad_image( + pixelData, + [image.width, image.height, image.channels], + this.pad_size, + ); + [pixelData, imgDims] = padded; // Update pixel data and image dimensions + } + + // Create HWC tensor + const img = new Tensor("float32", pixelData, imgDims); + + // convert to channel dimension format: + const transposed = transpose(img, [2, 0, 1]); // hwc -> chw + + return { + original_size: [srcHeight, srcWidth], + reshaped_input_size: reshaped_input_size, + pixel_values: transposed, + }; + } + + /** + * Calls the feature extraction process on an array of images, + * preprocesses each image, and concatenates the resulting + * features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. + * @param {...any} args Additional arguments. + * @returns {Promise} An object containing the concatenated pixel values (and other metadata) of the preprocessed images. 
+ */ + async _call(images, ...args) { + if (!Array.isArray(images)) { + images = [images]; + } + /** @type {PreprocessedImage[]} */ + const imageData = await Promise.all(images.map((x) => this.preprocess(x))); + + // Stack pixel values + const pixel_values = stack( + imageData.map((x) => x.pixel_values), + 0, + ); + + return { + pixel_values: pixel_values, + + // Original sizes of images + original_sizes: imageData.map((x) => x.original_size), + + // Reshaped sizes of images, before padding or cropping + reshaped_input_sizes: imageData.map((x) => x.reshaped_input_size), + }; + } } export class SegformerFeatureExtractor extends ImageFeatureExtractor { + /** + * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps. + * @param {*} outputs Raw outputs of the model. + * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size + * (height, width) of each prediction. If unset, predictions will not be resized. + * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps. + */ + post_process_semantic_segmentation(outputs, target_sizes = null) { + const logits = outputs.logits; + const batch_size = logits.dims[0]; + + if (target_sizes !== null && target_sizes.length !== batch_size) { + throw Error( + "Make sure that you pass in as many target sizes as the batch dimension of the logits", + ); + } + + const toReturn = []; + for (let i = 0; i < batch_size; ++i) { + const target_size = target_sizes !== null ? target_sizes[i] : null; + + let data = logits[i]; + + // 1. If target_size is not null, we need to resize the masks to the target size + if (target_size !== null) { + // resize the masks to the target size + data = interpolate(data, target_size, "bilinear", false); + } + const [height, width] = target_size ?? 
data.dims.slice(-2); + + const segmentation = new Tensor("int32", new Int32Array(height * width), [ + height, + width, + ]); + + // Buffer to store current largest value + const buffer = data[0].data; + for (let j = 1; j < data.dims[0]; ++j) { + const row = data[j].data; + for (let k = 0; k < row.length; ++k) { + if (row[k] > buffer[k]) { + buffer[k] = row[k]; + segmentation.data[k] = j; + } + } + } + + // Store which objects have labels + // This is much more efficient that creating a set of the final values + const hasLabel = new Array(data.dims[0]); + const out = segmentation.data; + for (let j = 0; j < out.length; ++j) { + const index = out[j]; + hasLabel[index] = index; + } + /** @type {number[]} The unique list of labels that were detected */ + const labels = hasLabel.filter((x) => x !== undefined); + + toReturn.push({ segmentation, labels }); + } + return toReturn; + } +} +export class BitImageProcessor extends ImageFeatureExtractor {} +export class DPTFeatureExtractor extends ImageFeatureExtractor {} +export class GLPNFeatureExtractor extends ImageFeatureExtractor {} +export class CLIPFeatureExtractor extends ImageFeatureExtractor {} +export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor {} +export class SiglipImageProcessor extends ImageFeatureExtractor {} +export class ConvNextFeatureExtractor extends ImageFeatureExtractor { + constructor(config) { + super(config); /** - * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps. - * @param {*} outputs Raw outputs of the model. - * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size - * (height, width) of each prediction. If unset, predictions will not be resized. - * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps. + * Percentage of the image to crop. Only has an effect if this.size < 384. 
*/ - post_process_semantic_segmentation(outputs, target_sizes = null) { + this.crop_pct = this.config.crop_pct ?? 224 / 256; + } - const logits = outputs.logits; - const batch_size = logits.dims[0]; - - if (target_sizes !== null && target_sizes.length !== batch_size) { - throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits") - } - - const toReturn = []; - for (let i = 0; i < batch_size; ++i) { - const target_size = target_sizes !== null ? target_sizes[i] : null; - - let data = logits[i]; - - // 1. If target_size is not null, we need to resize the masks to the target size - if (target_size !== null) { - // resize the masks to the target size - data = interpolate(data, target_size, 'bilinear', false); - } - const [height, width] = target_size ?? data.dims.slice(-2); - - const segmentation = new Tensor( - 'int32', - new Int32Array(height * width), - [height, width] - ); - - // Buffer to store current largest value - const buffer = data[0].data; - for (let j = 1; j < data.dims[0]; ++j) { - const row = data[j].data; - for (let k = 0; k < row.length; ++k) { - if (row[k] > buffer[k]) { - buffer[k] = row[k]; - segmentation.data[k] = j; - } - } - } - - // Store which objects have labels - // This is much more efficient that creating a set of the final values - const hasLabel = new Array(data.dims[0]); - const out = segmentation.data; - for (let j = 0; j < out.length; ++j) { - const index = out[j]; - hasLabel[index] = index; - } - /** @type {number[]} The unique list of labels that were detected */ - const labels = hasLabel.filter(x => x !== undefined); - - toReturn.push({ segmentation, labels }); - } - return toReturn; + async resize(image) { + const shortest_edge = this.size?.shortest_edge; + if (shortest_edge === undefined) { + throw new Error(`Size dictionary must contain 'shortest_edge' key.`); } + + if (shortest_edge < 384) { + // maintain same ratio, resizing shortest edge to shortest_edge/crop_pct + const 
resize_shortest_edge = Math.floor(shortest_edge / this.crop_pct); + + const [newWidth, newHeight] = this.get_resize_output_image_size(image, { + shortest_edge: resize_shortest_edge, + }); + + image = await image.resize(newWidth, newHeight, { + resample: this.resample, + }); + + // then crop to (shortest_edge, shortest_edge) + image = await image.center_crop(shortest_edge, shortest_edge); + } else { + // warping (no cropping) when evaluated at 384 or larger + image = await image.resize(shortest_edge, shortest_edge, { + resample: this.resample, + }); + } + + return image; + } } -export class BitImageProcessor extends ImageFeatureExtractor { } -export class DPTFeatureExtractor extends ImageFeatureExtractor { } -export class GLPNFeatureExtractor extends ImageFeatureExtractor { } -export class CLIPFeatureExtractor extends ImageFeatureExtractor { } -export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor { } -export class SiglipImageProcessor extends ImageFeatureExtractor { } -export class ConvNextFeatureExtractor extends ImageFeatureExtractor { - constructor(config) { - super(config); +export class ConvNextImageProcessor extends ConvNextFeatureExtractor {} // NOTE extends ConvNextFeatureExtractor +export class ViTFeatureExtractor extends ImageFeatureExtractor {} +export class ViTImageProcessor extends ImageFeatureExtractor {} - /** - * Percentage of the image to crop. Only has an effect if this.size < 384. - */ - this.crop_pct = this.config.crop_pct ?? 
(224 / 256); - } - - async resize(image) { - const shortest_edge = this.size?.shortest_edge; - if (shortest_edge === undefined) { - throw new Error(`Size dictionary must contain 'shortest_edge' key.`); - } - - if (shortest_edge < 384) { - // maintain same ratio, resizing shortest edge to shortest_edge/crop_pct - const resize_shortest_edge = Math.floor(shortest_edge / this.crop_pct); - - const [newWidth, newHeight] = this.get_resize_output_image_size(image, { - shortest_edge: resize_shortest_edge, - }); - - image = await image.resize(newWidth, newHeight, { - resample: this.resample, - }); - - // then crop to (shortest_edge, shortest_edge) - image = await image.center_crop(shortest_edge, shortest_edge); - } else { - // warping (no cropping) when evaluated at 384 or larger - image = await image.resize(shortest_edge, shortest_edge, { - resample: this.resample, - }); - } - - return image; - } -} -export class ConvNextImageProcessor extends ConvNextFeatureExtractor { } // NOTE extends ConvNextFeatureExtractor -export class ViTFeatureExtractor extends ImageFeatureExtractor { } -export class ViTImageProcessor extends ImageFeatureExtractor { } - -export class MobileViTFeatureExtractor extends ImageFeatureExtractor { } +export class MobileViTFeatureExtractor extends ImageFeatureExtractor {} export class OwlViTFeatureExtractor extends ImageFeatureExtractor { - /** @type {post_process_object_detection} */ - post_process_object_detection(...args) { - return post_process_object_detection(...args); - } + /** @type {post_process_object_detection} */ + post_process_object_detection(...args) { + return post_process_object_detection(...args); + } } -export class DeiTFeatureExtractor extends ImageFeatureExtractor { } -export class BeitFeatureExtractor extends ImageFeatureExtractor { } +export class DeiTFeatureExtractor extends ImageFeatureExtractor {} +export class BeitFeatureExtractor extends ImageFeatureExtractor {} export class DonutFeatureExtractor extends ImageFeatureExtractor { 
- pad_image(pixelData, imgDims, padSize, options = {}) { - const [imageWidth, imageHeight, imageChannels] = imgDims; + pad_image(pixelData, imgDims, padSize, options = {}) { + const [imageWidth, imageHeight, imageChannels] = imgDims; - let image_mean = this.image_mean; - if (!Array.isArray(this.image_mean)) { - image_mean = new Array(imageChannels).fill(image_mean); - } - - let image_std = this.image_std; - if (!Array.isArray(image_std)) { - image_std = new Array(imageChannels).fill(image_mean); - } - - const constant_values = image_mean.map((x, i) => - x / this.image_std[i]); - - return super.pad_image(pixelData, imgDims, padSize, { - center: true, - - // Since normalization is done after padding, we need to use certain constant values to ensure the same behaviour is observed. - // For more information, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/donut/image_processing_donut.py#L433-L451 - constant_values: constant_values, - ...options, - }); + let image_mean = this.image_mean; + if (!Array.isArray(this.image_mean)) { + image_mean = new Array(imageChannels).fill(image_mean); } + + let image_std = this.image_std; + if (!Array.isArray(image_std)) { + image_std = new Array(imageChannels).fill(image_mean); + } + + const constant_values = image_mean.map((x, i) => -x / this.image_std[i]); + + return super.pad_image(pixelData, imgDims, padSize, { + center: true, + + // Since normalization is done after padding, we need to use certain constant values to ensure the same behaviour is observed. 
+ // For more information, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/donut/image_processing_donut.py#L433-L451 + constant_values: constant_values, + ...options, + }); + } } -export class NougatImageProcessor extends DonutFeatureExtractor { } // NOTE extends DonutFeatureExtractor +export class NougatImageProcessor extends DonutFeatureExtractor {} // NOTE extends DonutFeatureExtractor /** * @typedef {object} DetrFeatureExtractorResultProps @@ -798,323 +835,336 @@ export class NougatImageProcessor extends DonutFeatureExtractor { } // NOTE exte * @extends ImageFeatureExtractor */ export class DetrFeatureExtractor extends ImageFeatureExtractor { - /** - * Calls the feature extraction process on an array of images, preprocesses - * each image, and concatenates the resulting features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. - */ - async _call(images) { - const result = await super._call(images); + /** + * Calls the feature extraction process on an array of images, preprocesses + * each image, and concatenates the resulting features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. + */ + async _call(images) { + const result = await super._call(images); - // TODO support differently-sized images, for now assume all images are the same size. - // TODO support different mask sizes (not just 64x64) - // Currently, just fill pixel mask with 1s - const maskSize = [result.pixel_values.dims[0], 64, 64]; - const pixel_mask = new Tensor( - 'int64', - new BigInt64Array(maskSize.reduce((a, b) => a * b)).fill(1n), - maskSize + // TODO support differently-sized images, for now assume all images are the same size. 
+ // TODO support different mask sizes (not just 64x64) + // Currently, just fill pixel mask with 1s + const maskSize = [result.pixel_values.dims[0], 64, 64]; + const pixel_mask = new Tensor( + "int64", + new BigInt64Array(maskSize.reduce((a, b) => a * b)).fill(1n), + maskSize, + ); + + return { ...result, pixel_mask }; + } + + /** + * Post-processes the outputs of the model (for object detection). + * @param {Object} outputs The outputs of the model that must be post-processed + * @param {Tensor} outputs.logits The logits + * @param {Tensor} outputs.pred_boxes The predicted boxes. + * @return {Object[]} An array of objects containing the post-processed outputs. + */ + + /** @type {post_process_object_detection} */ + post_process_object_detection(...args) { + return post_process_object_detection(...args); + } + + /** + * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`. + * @param {Tensor} class_logits The class logits. + * @param {Tensor} mask_logits The mask logits. + * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks. + * @param {number} num_labels The number of labels. + * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels. 
+ */ + remove_low_and_no_objects( + class_logits, + mask_logits, + object_mask_threshold, + num_labels, + ) { + let mask_probs_item = []; + let pred_scores_item = []; + let pred_labels_item = []; + + for (let j = 0; j < class_logits.dims[0]; ++j) { + let cls = class_logits[j]; + let mask = mask_logits[j]; + + let pred_label = max(cls.data)[1]; + if (pred_label === num_labels) { + // Is the background, so we ignore it + continue; + } + + let scores = softmax(cls.data); + let pred_score = scores[pred_label]; + if (pred_score > object_mask_threshold) { + mask_probs_item.push(mask); + pred_scores_item.push(pred_score); + pred_labels_item.push(pred_label); + } + } + + return [mask_probs_item, pred_scores_item, pred_labels_item]; + } + + /** + * Checks whether the segment is valid or not. + * @param {Int32Array} mask_labels Labels for each pixel in the mask. + * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks. + * @param {number} k The class id of the segment. + * @param {number} mask_threshold The mask threshold. + * @param {number} overlap_mask_area_threshold The overlap mask area threshold. + * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels. 
+ */ + check_segment_validity( + mask_labels, + mask_probs, + k, + mask_threshold = 0.5, + overlap_mask_area_threshold = 0.8, + ) { + // mask_k is a 1D array of indices, indicating where the mask is equal to k + let mask_k = []; + let mask_k_area = 0; + let original_area = 0; + + // Compute the area of all the stuff in query k + for (let i = 0; i < mask_labels.length; ++i) { + if (mask_labels[i] === k) { + mask_k.push(i); + ++mask_k_area; + } + + if (mask_probs[k].data[i] >= mask_threshold) { + ++original_area; + } + } + let mask_exists = mask_k_area > 0 && original_area > 0; + + // Eliminate disconnected tiny segments + if (mask_exists) { + // Perform additional check + let area_ratio = mask_k_area / original_area; + mask_exists = area_ratio > overlap_mask_area_threshold; + } + + return [mask_exists, mask_k]; + } + + /** + * Computes the segments. + * @param {Tensor[]} mask_probs The mask probabilities. + * @param {number[]} pred_scores The predicted scores. + * @param {number[]} pred_labels The predicted labels. + * @param {number} mask_threshold The mask threshold. + * @param {number} overlap_mask_area_threshold The overlap mask area threshold. + * @param {Set} label_ids_to_fuse The label ids to fuse. + * @param {number[]} target_size The target size of the image. + * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments. + */ + compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold, + overlap_mask_area_threshold, + label_ids_to_fuse = null, + target_size = null, + ) { + let [height, width] = target_size ?? mask_probs[0].dims; + + let segmentation = new Tensor("int32", new Int32Array(height * width), [ + height, + width, + ]); + let segments = []; + + // 1. 
If target_size is not null, we need to resize the masks to the target size + if (target_size !== null) { + // resize the masks to the target size + for (let i = 0; i < mask_probs.length; ++i) { + mask_probs[i] = interpolate( + mask_probs[i], + target_size, + "bilinear", + false, ); - - return { ...result, pixel_mask }; + } } - /** - * Post-processes the outputs of the model (for object detection). - * @param {Object} outputs The outputs of the model that must be post-processed - * @param {Tensor} outputs.logits The logits - * @param {Tensor} outputs.pred_boxes The predicted boxes. - * @return {Object[]} An array of objects containing the post-processed outputs. - */ + // 2. Weigh each mask by its prediction score + // NOTE: `mask_probs` is updated in-place + // + // Temporary storage for the best label/scores for each pixel ([height, width]): + let mask_labels = new Int32Array(mask_probs[0].data.length); + let bestScores = new Float32Array(mask_probs[0].data.length); - /** @type {post_process_object_detection} */ - post_process_object_detection(...args) { - return post_process_object_detection(...args); - } + for (let i = 0; i < mask_probs.length; ++i) { + let score = pred_scores[i]; - /** - * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`. - * @param {Tensor} class_logits The class logits. - * @param {Tensor} mask_logits The mask logits. - * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks. - * @param {number} num_labels The number of labels. - * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels. 
- */ - remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) { - - let mask_probs_item = []; - let pred_scores_item = []; - let pred_labels_item = []; - - for (let j = 0; j < class_logits.dims[0]; ++j) { - let cls = class_logits[j]; - let mask = mask_logits[j]; - - let pred_label = max(cls.data)[1]; - if (pred_label === num_labels) { - // Is the background, so we ignore it - continue; - } - - let scores = softmax(cls.data); - let pred_score = scores[pred_label]; - if (pred_score > object_mask_threshold) { - mask_probs_item.push(mask); - pred_scores_item.push(pred_score); - pred_labels_item.push(pred_label); - } + for (let j = 0; j < mask_probs[i].data.length; ++j) { + mask_probs[i].data[j] *= score; + if (mask_probs[i].data[j] > bestScores[j]) { + mask_labels[j] = i; + bestScores[j] = mask_probs[i].data[j]; } - - return [mask_probs_item, pred_scores_item, pred_labels_item]; - + } } - /** - * Checks whether the segment is valid or not. - * @param {Int32Array} mask_labels Labels for each pixel in the mask. - * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks. - * @param {number} k The class id of the segment. - * @param {number} mask_threshold The mask threshold. - * @param {number} overlap_mask_area_threshold The overlap mask area threshold. - * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels. 
- */ - check_segment_validity( + let current_segment_id = 0; + + // let stuff_memory_list = {} + for (let k = 0; k < pred_labels.length; ++k) { + let pred_class = pred_labels[k]; + + // TODO add `should_fuse` + // let should_fuse = pred_class in label_ids_to_fuse + + // Check if mask exists and large enough to be a segment + let [mask_exists, mask_k] = this.check_segment_validity( mask_labels, mask_probs, k, - mask_threshold = 0.5, - overlap_mask_area_threshold = 0.8 - ) { - // mask_k is a 1D array of indices, indicating where the mask is equal to k - let mask_k = []; - let mask_k_area = 0; - let original_area = 0; - - // Compute the area of all the stuff in query k - for (let i = 0; i < mask_labels.length; ++i) { - if (mask_labels[i] === k) { - mask_k.push(i); - ++mask_k_area; - } - - if (mask_probs[k].data[i] >= mask_threshold) { - ++original_area; - } - } - let mask_exists = mask_k_area > 0 && original_area > 0; - - // Eliminate disconnected tiny segments - if (mask_exists) { - // Perform additional check - let area_ratio = mask_k_area / original_area; - mask_exists = area_ratio > overlap_mask_area_threshold; - } - - return [mask_exists, mask_k] - } - - /** - * Computes the segments. - * @param {Tensor[]} mask_probs The mask probabilities. - * @param {number[]} pred_scores The predicted scores. - * @param {number[]} pred_labels The predicted labels. - * @param {number} mask_threshold The mask threshold. - * @param {number} overlap_mask_area_threshold The overlap mask area threshold. - * @param {Set} label_ids_to_fuse The label ids to fuse. - * @param {number[]} target_size The target size of the image. - * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments. - */ - compute_segments( - mask_probs, - pred_scores, - pred_labels, mask_threshold, overlap_mask_area_threshold, - label_ids_to_fuse = null, - target_size = null, - ) { - let [height, width] = target_size ?? 
mask_probs[0].dims; + ); + + if (!mask_exists) { + // Nothing to see here + continue; + } + + // TODO + // if (pred_class in stuff_memory_list) { + // current_segment_id = stuff_memory_list[pred_class] + // } else { + // current_segment_id += 1; + // } + ++current_segment_id; + + // Add current object segment to final segmentation map + for (let index of mask_k) { + segmentation.data[index] = current_segment_id; + } + + segments.push({ + id: current_segment_id, + label_id: pred_class, + // was_fused: should_fuse, TODO + score: pred_scores[k], + }); + + // TODO + // if(should_fuse){ + // stuff_memory_list[pred_class] = current_segment_id + // } + } + + return [segmentation, segments]; + } + + /** + * Post-process the model output to generate the final panoptic segmentation. + * @param {*} outputs The model output to post process + * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks. + * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values. + * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask. + * @param {Set} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together. + * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to. + * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>} + */ + post_process_panoptic_segmentation( + outputs, + threshold = 0.5, + mask_threshold = 0.5, + overlap_mask_area_threshold = 0.8, + label_ids_to_fuse = null, + target_sizes = null, + ) { + if (label_ids_to_fuse === null) { + console.warn("`label_ids_to_fuse` unset. 
No instance will be fused."); + label_ids_to_fuse = new Set(); + } + + const class_queries_logits = outputs.logits; // [batch_size, num_queries, num_classes+1] + const masks_queries_logits = outputs.pred_masks; // [batch_size, num_queries, height, width] + + const mask_probs = masks_queries_logits.sigmoid(); // [batch_size, num_queries, height, width] + + let [batch_size, num_queries, num_labels] = class_queries_logits.dims; + num_labels -= 1; // Remove last class (background) + + if (target_sizes !== null && target_sizes.length !== batch_size) { + throw Error( + "Make sure that you pass in as many target sizes as the batch dimension of the logits", + ); + } + + let toReturn = []; + for (let i = 0; i < batch_size; ++i) { + let target_size = target_sizes !== null ? target_sizes[i] : null; + + let class_logits = class_queries_logits[i]; + let mask_logits = mask_probs[i]; + + let [mask_probs_item, pred_scores_item, pred_labels_item] = + this.remove_low_and_no_objects( + class_logits, + mask_logits, + threshold, + num_labels, + ); + + if (pred_labels_item.length === 0) { + // No mask found + let [height, width] = target_size ?? mask_logits.dims.slice(-2); let segmentation = new Tensor( - 'int32', - new Int32Array(height * width), - [height, width] + "int32", + new Int32Array(height * width).fill(-1), + [height, width], ); - let segments = []; + toReturn.push({ + segmentation: segmentation, + segments_info: [], + }); + continue; + } - // 1. 
If target_size is not null, we need to resize the masks to the target size - if (target_size !== null) { - // resize the masks to the target size - for (let i = 0; i < mask_probs.length; ++i) { - mask_probs[i] = interpolate(mask_probs[i], target_size, 'bilinear', false); - } - } + // Get segmentation map and segment information of batch item + let [segmentation, segments] = this.compute_segments( + mask_probs_item, + pred_scores_item, + pred_labels_item, + mask_threshold, + overlap_mask_area_threshold, + label_ids_to_fuse, + target_size, + ); - // 2. Weigh each mask by its prediction score - // NOTE: `mask_probs` is updated in-place - // - // Temporary storage for the best label/scores for each pixel ([height, width]): - let mask_labels = new Int32Array(mask_probs[0].data.length); - let bestScores = new Float32Array(mask_probs[0].data.length); - - for (let i = 0; i < mask_probs.length; ++i) { - let score = pred_scores[i]; - - for (let j = 0; j < mask_probs[i].data.length; ++j) { - mask_probs[i].data[j] *= score - if (mask_probs[i].data[j] > bestScores[j]) { - mask_labels[j] = i; - bestScores[j] = mask_probs[i].data[j]; - } - } - } - - let current_segment_id = 0; - - // let stuff_memory_list = {} - for (let k = 0; k < pred_labels.length; ++k) { - let pred_class = pred_labels[k]; - - // TODO add `should_fuse` - // let should_fuse = pred_class in label_ids_to_fuse - - // Check if mask exists and large enough to be a segment - let [mask_exists, mask_k] = this.check_segment_validity( - mask_labels, - mask_probs, - k, - mask_threshold, - overlap_mask_area_threshold - ) - - if (!mask_exists) { - // Nothing to see here - continue; - } - - // TODO - // if (pred_class in stuff_memory_list) { - // current_segment_id = stuff_memory_list[pred_class] - // } else { - // current_segment_id += 1; - // } - ++current_segment_id; - - - // Add current object segment to final segmentation map - for (let index of mask_k) { - segmentation.data[index] = current_segment_id; - } - - 
segments.push({ - id: current_segment_id, - label_id: pred_class, - // was_fused: should_fuse, TODO - score: pred_scores[k], - }) - - // TODO - // if(should_fuse){ - // stuff_memory_list[pred_class] = current_segment_id - // } - } - - return [segmentation, segments]; + toReturn.push({ + segmentation: segmentation, + segments_info: segments, + }); } - /** - * Post-process the model output to generate the final panoptic segmentation. - * @param {*} outputs The model output to post process - * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks. - * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values. - * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask. - * @param {Set} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together. - * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to. - * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>} - */ - post_process_panoptic_segmentation( - outputs, - threshold = 0.5, - mask_threshold = 0.5, - overlap_mask_area_threshold = 0.8, - label_ids_to_fuse = null, - target_sizes = null, - ) { - if (label_ids_to_fuse === null) { - console.warn("`label_ids_to_fuse` unset. 
No instance will be fused.") - label_ids_to_fuse = new Set(); - } + return toReturn; + } - const class_queries_logits = outputs.logits; // [batch_size, num_queries, num_classes+1] - const masks_queries_logits = outputs.pred_masks; // [batch_size, num_queries, height, width] - - const mask_probs = masks_queries_logits.sigmoid() // [batch_size, num_queries, height, width] - - let [batch_size, num_queries, num_labels] = class_queries_logits.dims; - num_labels -= 1; // Remove last class (background) - - if (target_sizes !== null && target_sizes.length !== batch_size) { - throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits") - } - - let toReturn = []; - for (let i = 0; i < batch_size; ++i) { - let target_size = target_sizes !== null ? target_sizes[i] : null; - - let class_logits = class_queries_logits[i]; - let mask_logits = mask_probs[i]; - - let [mask_probs_item, pred_scores_item, pred_labels_item] = this.remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels); - - if (pred_labels_item.length === 0) { - // No mask found - let [height, width] = target_size ?? 
mask_logits.dims.slice(-2); - - let segmentation = new Tensor( - 'int32', - new Int32Array(height * width).fill(-1), - [height, width] - ) - toReturn.push({ - segmentation: segmentation, - segments_info: [] - }); - continue; - } - - - // Get segmentation map and segment information of batch item - let [segmentation, segments] = this.compute_segments( - mask_probs_item, - pred_scores_item, - pred_labels_item, - mask_threshold, - overlap_mask_area_threshold, - label_ids_to_fuse, - target_size, - ) - - toReturn.push({ - segmentation: segmentation, - segments_info: segments - }) - } - - return toReturn; - } - - post_process_instance_segmentation() { - // TODO - throw Error("Not implemented yet"); - } + post_process_instance_segmentation() { + // TODO + throw Error("Not implemented yet"); + } } export class YolosFeatureExtractor extends ImageFeatureExtractor { - /** @type {post_process_object_detection} */ - post_process_object_detection(...args) { - return post_process_object_detection(...args); - } + /** @type {post_process_object_detection} */ + post_process_object_detection(...args) { + return post_process_object_detection(...args); + } } /** @@ -1127,673 +1177,710 @@ export class YolosFeatureExtractor extends ImageFeatureExtractor { */ export class SamImageProcessor extends ImageFeatureExtractor { + /** + * + * @param {any} input_points + * @param {HeightWidth[]} original_sizes + * @param {HeightWidth[]} reshaped_input_sizes + * @returns {Tensor} + */ + reshape_input_points(input_points, original_sizes, reshaped_input_sizes) { + // Make deep copy to avoid altering user's input + input_points = structuredClone(input_points); + let shape = calculateDimensions(input_points); - /** - * - * @param {any} input_points - * @param {HeightWidth[]} original_sizes - * @param {HeightWidth[]} reshaped_input_sizes - * @returns {Tensor} - */ - reshape_input_points(input_points, original_sizes, reshaped_input_sizes) { + // TODO: add support for 2D input_points + if (shape.length 
=== 3) { + // Correct user's input + shape = [1, ...shape]; + input_points = [input_points]; + } else if (shape.length !== 4) { + throw Error( + "The input_points must be a 4D tensor of shape `batch_size`, `point_batch_size`, `nb_points_per_image`, `2`.", + ); + } - // Make deep copy to avoid altering user's input - input_points = structuredClone(input_points); - let shape = calculateDimensions(input_points); + // Reshape input points + for (let i = 0; i < input_points.length; ++i) { + // batch_size + let originalImageSize = original_sizes[i]; + let reshapedImageSize = reshaped_input_sizes[i]; - // TODO: add support for 2D input_points - if (shape.length === 3) { - // Correct user's input - shape = [1, ...shape]; - input_points = [input_points]; - } else if (shape.length !== 4) { - throw Error("The input_points must be a 4D tensor of shape `batch_size`, `point_batch_size`, `nb_points_per_image`, `2`.") + let resizeFactors = [ + reshapedImageSize[0] / originalImageSize[0], + reshapedImageSize[1] / originalImageSize[1], + ]; + + for (let j = 0; j < input_points[i].length; ++j) { + // point_batch_size + for (let k = 0; k < input_points[i][j].length; ++k) { + // nb_points_per_image + for (let w = 0; w < input_points[i][j][k].length; ++w) { + // 2 + input_points[i][j][k][w] *= resizeFactors[w]; + } } + } + } - // Reshape input points - for (let i = 0; i < input_points.length; ++i) { // batch_size - let originalImageSize = original_sizes[i]; - let reshapedImageSize = reshaped_input_sizes[i]; + return new Tensor( + "float32", + Float32Array.from(input_points.flat(Infinity)), + shape, + ); + } - let resizeFactors = [ - reshapedImageSize[0] / originalImageSize[0], - reshapedImageSize[1] / originalImageSize[1] - ] + /** + * + * @param {any} input_labels + * @param {Tensor} input_points + * @returns {Tensor} + */ + add_input_labels(input_labels, input_points) { + let shape = calculateDimensions(input_labels); + if (shape.length === 2) { + // Correct user's input + shape = [1, 
...shape]; + input_labels = [input_labels]; + } else if (shape.length !== 3) { + throw Error( + "The input_points must be a 4D tensor of shape `batch_size`, `point_batch_size`, `nb_points_per_image`, `2`.", + ); + } - for (let j = 0; j < input_points[i].length; ++j) { // point_batch_size - for (let k = 0; k < input_points[i][j].length; ++k) { // nb_points_per_image - for (let w = 0; w < input_points[i][j][k].length; ++w) { // 2 - input_points[i][j][k][w] *= resizeFactors[w]; - } - } + if (shape.some((x, i) => x !== input_points.dims[i])) { + throw Error( + `The first ${shape.length} dimensions of 'input_points' and 'input_labels' must be the same.`, + ); + } + return new Tensor("int64", input_labels.flat(Infinity).map(BigInt), shape); + } + /** + * @param {any[]} images The URL(s) of the image(s) to extract features from. + * @param {any} [input_points] A 3D or 4D array, representing the input points provided by the user. + * - 3D: `[point_batch_size, nb_points_per_image, 2]`. In this case, `batch_size` is assumed to be 1. + * - 4D: `[batch_size, point_batch_size, nb_points_per_image, 2]`. + * @param {any} [input_labels] A 2D or 3D array, representing the input labels for the points, used by the prompt encoder to encode the prompt. + * - 2D: `[point_batch_size, nb_points_per_image]`. In this case, `batch_size` is assumed to be 1. + * - 3D: `[batch_size, point_batch_size, nb_points_per_image]`. 
+ * @returns {Promise} + */ + async _call(images, input_points = null, input_labels = null) { + // TODO allow user to use preprocessed images + /** @type {SamImageProcessorResult} */ + const processed = await super._call(images); + + if (input_points) { + processed.input_points = this.reshape_input_points( + input_points, + processed.original_sizes, + processed.reshaped_input_sizes, + ); + } + + if (input_labels) { + if (!processed.input_points) { + throw Error( + "`input_points` must be provided if `input_labels` are provided.", + ); + } + processed.input_labels = this.add_input_labels( + input_labels, + processed.input_points, + ); + } + + return processed; + } + + /** + * Remove padding and upscale masks to the original image size. + * @param {Tensor} masks Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format. + * @param {number[][]} original_sizes The original sizes of each image before it was resized to the model's expected input shape, in (height, width) format. + * @param {number[][]} reshaped_input_sizes The size of each image as it is fed to the model, in (height, width) format. Used to remove padding. + * @param {Object} options Optional parameters for post-processing. + * @param {number} [options.mask_threshold] The threshold to use for binarizing the masks. + * @param {boolean} [options.binarize] Whether to binarize the masks. + * @param {Object} [options.pad_size] The target size the images were padded to before being passed to the model. If `null`, the target size is assumed to be the processor's `pad_size`. + * @param {number} [options.pad_size.height] The height the images were padded to. + * @param {number} [options.pad_size.width] The width the images were padded to. + * @returns {Tensor[]} Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size. 
+ */ + post_process_masks( + masks, + original_sizes, + reshaped_input_sizes, + { mask_threshold = 0.0, binarize = true, pad_size = null } = {}, + ) { + // masks: [1, 1, 3, 256, 256] + + const output_masks = []; + + pad_size = pad_size ?? this.pad_size; + + const target_image_size = [pad_size.height, pad_size.width]; + + for (let i = 0; i < original_sizes.length; ++i) { + const original_size = original_sizes[i]; + const reshaped_input_size = reshaped_input_sizes[i]; + + const mask = masks[i]; // [b, c, h, w] + + // TODO: improve + const interpolated_masks = []; + for (let j = 0; j < mask.dims[0]; ++j) { + const m = mask[j]; // 3d tensor + + // Upscale mask to padded size + let interpolated_mask = interpolate( + m, + target_image_size, + "bilinear", + false, + ); + + // Crop mask + interpolated_mask = interpolated_mask.slice( + null, + [0, reshaped_input_size[0]], + [0, reshaped_input_size[1]], + ); + + // Downscale mask + interpolated_mask = interpolate( + interpolated_mask, + original_size, + "bilinear", + false, + ); + + if (binarize) { + const binarizedMaskData = new Uint8Array( + interpolated_mask.data.length, + ); + for (let i = 0; i < interpolated_mask.data.length; ++i) { + if (interpolated_mask.data[i] > mask_threshold) { + binarizedMaskData[i] = 1; } + } + interpolated_mask = new Tensor( + "bool", + binarizedMaskData, + interpolated_mask.dims, + ); } - return new Tensor( - 'float32', - Float32Array.from(input_points.flat(Infinity)), - shape - ) + interpolated_masks.push(interpolated_mask); + } + output_masks.push(stack(interpolated_masks)); } - /** - * - * @param {any} input_labels - * @param {Tensor} input_points - * @returns {Tensor} - */ - add_input_labels(input_labels, input_points) { - let shape = calculateDimensions(input_labels); - if (shape.length === 2) { - // Correct user's input - shape = [1, ...shape]; - input_labels = [input_labels]; - } else if (shape.length !== 3) { - throw Error("The input_points must be a 4D tensor of shape `batch_size`, 
`point_batch_size`, `nb_points_per_image`, `2`.") - } - - if (shape.some((x, i) => x !== input_points.dims[i])) { - throw Error(`The first ${shape.length} dimensions of 'input_points' and 'input_labels' must be the same.`) - } - return new Tensor( - 'int64', - input_labels.flat(Infinity).map(BigInt), - shape, - ) - } - /** - * @param {any[]} images The URL(s) of the image(s) to extract features from. - * @param {any} [input_points] A 3D or 4D array, representing the input points provided by the user. - * - 3D: `[point_batch_size, nb_points_per_image, 2]`. In this case, `batch_size` is assumed to be 1. - * - 4D: `[batch_size, point_batch_size, nb_points_per_image, 2]`. - * @param {any} [input_labels] A 2D or 3D array, representing the input labels for the points, used by the prompt encoder to encode the prompt. - * - 2D: `[point_batch_size, nb_points_per_image]`. In this case, `batch_size` is assumed to be 1. - * - 3D: `[batch_size, point_batch_size, nb_points_per_image]`. - * @returns {Promise} - */ - async _call(images, input_points = null, input_labels = null) { - // TODO allow user to use preprocessed images - /** @type {SamImageProcessorResult} */ - const processed = await super._call(images); - - if (input_points) { - processed.input_points = this.reshape_input_points( - input_points, processed.original_sizes, processed.reshaped_input_sizes - ); - } - - if (input_labels) { - if (!processed.input_points) { - throw Error("`input_points` must be provided if `input_labels` are provided.") - } - processed.input_labels = this.add_input_labels(input_labels, processed.input_points); - } - - return processed; - } - - /** - * Remove padding and upscale masks to the original image size. - * @param {Tensor} masks Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format. - * @param {number[][]} original_sizes The original sizes of each image before it was resized to the model's expected input shape, in (height, width) format. 
- * @param {number[][]} reshaped_input_sizes The size of each image as it is fed to the model, in (height, width) format. Used to remove padding. - * @param {Object} options Optional parameters for post-processing. - * @param {number} [options.mask_threshold] The threshold to use for binarizing the masks. - * @param {boolean} [options.binarize] Whether to binarize the masks. - * @param {Object} [options.pad_size] The target size the images were padded to before being passed to the model. If `null`, the target size is assumed to be the processor's `pad_size`. - * @param {number} [options.pad_size.height] The height the images were padded to. - * @param {number} [options.pad_size.width] The width the images were padded to. - * @returns {Tensor[]} Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size. - */ - post_process_masks(masks, original_sizes, reshaped_input_sizes, { - mask_threshold = 0.0, - binarize = true, - pad_size = null, - } = {}) { - // masks: [1, 1, 3, 256, 256] - - const output_masks = []; - - pad_size = pad_size ?? 
this.pad_size; - - const target_image_size = [pad_size.height, pad_size.width]; - - for (let i = 0; i < original_sizes.length; ++i) { - const original_size = original_sizes[i]; - const reshaped_input_size = reshaped_input_sizes[i]; - - const mask = masks[i]; // [b, c, h, w] - - // TODO: improve - const interpolated_masks = []; - for (let j = 0; j < mask.dims[0]; ++j) { - const m = mask[j]; // 3d tensor - - // Upscale mask to padded size - let interpolated_mask = interpolate(m, target_image_size, 'bilinear', false); - - // Crop mask - interpolated_mask = interpolated_mask.slice(null, [0, reshaped_input_size[0]], [0, reshaped_input_size[1]]); - - // Downscale mask - interpolated_mask = interpolate(interpolated_mask, original_size, 'bilinear', false); - - if (binarize) { - const binarizedMaskData = new Uint8Array(interpolated_mask.data.length); - for (let i = 0; i < interpolated_mask.data.length; ++i) { - if (interpolated_mask.data[i] > mask_threshold) { - binarizedMaskData[i] = 1; - } - } - interpolated_mask = new Tensor( - 'bool', - binarizedMaskData, - interpolated_mask.dims - ) - } - - interpolated_masks.push(interpolated_mask); - } - - output_masks.push(stack(interpolated_masks)); - } - - return output_masks; - } + return output_masks; + } } export class Swin2SRImageProcessor extends ImageFeatureExtractor { - pad_image(pixelData, imgDims, padSize, options = {}) { - // NOTE: In this case, `padSize` represents the size of the sliding window for the local attention. - // In other words, the image is padded so that its width and height are multiples of `padSize`. - const [imageWidth, imageHeight, imageChannels] = imgDims; + pad_image(pixelData, imgDims, padSize, options = {}) { + // NOTE: In this case, `padSize` represents the size of the sliding window for the local attention. + // In other words, the image is padded so that its width and height are multiples of `padSize`. 
+ const [imageWidth, imageHeight, imageChannels] = imgDims; - return super.pad_image(pixelData, imgDims, { - // NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already - // a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19). - // For this reason, we only add padding when the image's width/height is not a multiple of `pad_size`. - width: imageWidth + (padSize - imageWidth % padSize) % padSize, - height: imageHeight + (padSize - imageHeight % padSize) % padSize, - }, { - mode: 'symmetric', - center: false, - constant_values: -1, - ...options, - }) - } + return super.pad_image( + pixelData, + imgDims, + { + // NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already + // a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19). + // For this reason, we only add padding when the image's width/height is not a multiple of `pad_size`. + width: imageWidth + ((padSize - (imageWidth % padSize)) % padSize), + height: imageHeight + ((padSize - (imageHeight % padSize)) % padSize), + }, + { + mode: "symmetric", + center: false, + constant_values: -1, + ...options, + }, + ); + } } export class VitMatteImageProcessor extends ImageFeatureExtractor { - /** - * Calls the feature extraction process on an array of images, preprocesses - * each image, and concatenates the resulting features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @param {RawImage[]} trimaps The trimaps(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
- */ - async _call(images, trimaps) { - if (!Array.isArray(images)) { - images = [images]; - } - if (!Array.isArray(trimaps)) { - trimaps = [trimaps]; - } - - const imageData = await Promise.all(images.map(x => this.preprocess(x))); - const trimapData = await Promise.all(trimaps.map(x => this.preprocess(x, { - do_normalize: false, - do_convert_rgb: false, - do_convert_grayscale: true, - }))); - - - // Stack pixel values - const pixel_values = stack(imageData.map( - // Concatenate images and trimaps - (x, i) => cat([x.pixel_values, trimapData[i].pixel_values], 0) - ), 0); - - return { - pixel_values: pixel_values, - - // Original sizes of images - original_sizes: imageData.map(x => x.original_size), - - // Reshaped sizes of images, before padding or cropping - reshaped_input_sizes: imageData.map(x => x.reshaped_input_size), - } + /** + * Calls the feature extraction process on an array of images, preprocesses + * each image, and concatenates the resulting features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. + * @param {RawImage[]} trimaps The trimaps(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
+ */ + async _call(images, trimaps) { + if (!Array.isArray(images)) { + images = [images]; } + if (!Array.isArray(trimaps)) { + trimaps = [trimaps]; + } + + const imageData = await Promise.all(images.map((x) => this.preprocess(x))); + const trimapData = await Promise.all( + trimaps.map((x) => + this.preprocess(x, { + do_normalize: false, + do_convert_rgb: false, + do_convert_grayscale: true, + }), + ), + ); + + // Stack pixel values + const pixel_values = stack( + imageData.map( + // Concatenate images and trimaps + (x, i) => cat([x.pixel_values, trimapData[i].pixel_values], 0), + ), + 0, + ); + + return { + pixel_values: pixel_values, + + // Original sizes of images + original_sizes: imageData.map((x) => x.original_size), + + // Reshaped sizes of images, before padding or cropping + reshaped_input_sizes: imageData.map((x) => x.reshaped_input_size), + }; + } } export class WhisperFeatureExtractor extends FeatureExtractor { + constructor(config) { + super(config); - constructor(config) { - super(config); + // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. + this.config.mel_filters ??= mel_filter_bank( + Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins + this.config.feature_size, // num_mel_filters + 0.0, // min_frequency + 8000.0, // max_frequency + this.config.sampling_rate, // sampling_rate + "slaney", // norm + "slaney", // mel_scale + ); - // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist. 
- this.config.mel_filters ??= mel_filter_bank( - Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins - this.config.feature_size, // num_mel_filters - 0.0, // min_frequency - 8000.0, // max_frequency - this.config.sampling_rate, // sampling_rate - "slaney", // norm - "slaney", // mel_scale - ); + this.window = window_function(this.config.n_fft, "hann"); + } - this.window = window_function(this.config.n_fft, 'hann'); + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. + */ + _extract_fbank_features(waveform) { + const { data, dims } = spectrogram( + waveform, + this.window, // window + this.config.n_fft, // frame_length + this.config.hop_length, // hop_length + { + power: 2.0, + mel_filters: this.config.mel_filters, + log_mel: "log10", + + // Custom + max_num_frames: this.config.nb_max_frames, // 3000 + }, + ); + + const maxValue = max(data)[0]; + + for (let i = 0; i < data.length; ++i) { + data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0; } - /** - * Computes the log-Mel spectrogram of the provided audio waveform. - * @param {Float32Array|Float64Array} waveform The audio waveform to process. - * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
- */ - _extract_fbank_features(waveform) { - const { data, dims } = spectrogram( - waveform, - this.window, // window - this.config.n_fft, // frame_length - this.config.hop_length, // hop_length - { - power: 2.0, - mel_filters: this.config.mel_filters, - log_mel: 'log10', + return { data, dims }; + } - // Custom - max_num_frames: this.config.nb_max_frames, // 3000 - } - ) + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + async _call(audio) { + validate_audio_inputs(audio, "WhisperFeatureExtractor"); - const maxValue = max(data)[0]; - - for (let i = 0; i < data.length; ++i) { - data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0; - } - - return { data, dims }; + let waveform; + if (audio.length > this.config.n_samples) { + console.warn( + "Attempting to extract features for audio longer than 30 seconds. " + + "If using a pipeline to extract transcript from a long audio clip, " + + "remember to specify `chunk_length_s` and/or `stride_length_s`.", + ); + waveform = audio.slice(0, this.config.n_samples); + } else { + // pad with zeros + waveform = new Float32Array(this.config.n_samples); + waveform.set(audio); } - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. 
- */ - async _call(audio) { - validate_audio_inputs(audio, 'WhisperFeatureExtractor'); + const { data, dims } = this._extract_fbank_features(waveform); - let waveform; - if (audio.length > this.config.n_samples) { - console.warn( - "Attempting to extract features for audio longer than 30 seconds. " + - "If using a pipeline to extract transcript from a long audio clip, " + - "remember to specify `chunk_length_s` and/or `stride_length_s`." - ); - waveform = audio.slice(0, this.config.n_samples); - } else { - // pad with zeros - waveform = new Float32Array(this.config.n_samples); - waveform.set(audio); - } - - const { data, dims } = this._extract_fbank_features(waveform); - - return { - input_features: new Tensor('float32', - data, - [1, ...dims] - ) - }; - } + return { + input_features: new Tensor("float32", data, [1, ...dims]), + }; + } } export class Wav2Vec2FeatureExtractor extends FeatureExtractor { + /** + * @param {Float32Array} input_values + * @returns {Float32Array} + */ + _zero_mean_unit_var_norm(input_values) { + // TODO support batch? + const sum = input_values.reduce((a, b) => a + b, 0); + const mean = sum / input_values.length; + const variance = + input_values.reduce((a, b) => a + (b - mean) ** 2, 0) / + input_values.length; + return input_values.map((x) => (x - mean) / Math.sqrt(variance + 1e-7)); + } - /** - * @param {Float32Array} input_values - * @returns {Float32Array} - */ - _zero_mean_unit_var_norm(input_values) { - // TODO support batch? - const sum = input_values.reduce((a, b) => a + b, 0); - const mean = sum / input_values.length; - const variance = input_values.reduce((a, b) => a + (b - mean) ** 2, 0) / input_values.length; - return input_values.map(x => (x - mean) / Math.sqrt(variance + 1e-7)); + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. 
+ * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors. + */ + async _call(audio) { + validate_audio_inputs(audio, "Wav2Vec2FeatureExtractor"); + + if (audio instanceof Float64Array) { + audio = new Float32Array(audio); } - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors. - */ - async _call(audio) { - validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor'); + let input_values = audio; - if (audio instanceof Float64Array) { - audio = new Float32Array(audio); - } - - let input_values = audio; - - // zero-mean and unit-variance normalization - if (this.config.do_normalize) { - input_values = this._zero_mean_unit_var_norm(input_values); - } - - // TODO: allow user to pass in attention mask - const shape = [1, input_values.length]; - return { - input_values: new Tensor('float32', input_values, shape), - attention_mask: new Tensor('int64', new BigInt64Array(input_values.length).fill(1n), shape) - }; + // zero-mean and unit-variance normalization + if (this.config.do_normalize) { + input_values = this._zero_mean_unit_var_norm(input_values); } + + // TODO: allow user to pass in attention mask + const shape = [1, input_values.length]; + return { + input_values: new Tensor("float32", input_values, shape), + attention_mask: new Tensor( + "int64", + new BigInt64Array(input_values.length).fill(1n), + shape, + ), + }; + } } export class ASTFeatureExtractor extends FeatureExtractor { + constructor(config) { + super(config); + const sampling_rate = this.config.sampling_rate; + const mel_filters = mel_filter_bank( + 256, // num_frequency_bins + 
this.config.num_mel_bins, // num_mel_filters + 20, // min_frequency + Math.floor(sampling_rate / 2), // max_frequency + sampling_rate, // sampling_rate + null, // norm + "kaldi", // mel_scale + true, // triangularize_in_mel_space + ); - constructor(config) { - super(config); + // Do padding: + for (let i = 0; i < mel_filters.length; ++i) { + mel_filters[i].push(0); + } + this.mel_filters = mel_filters; - const sampling_rate = this.config.sampling_rate; - const mel_filters = mel_filter_bank( - 256, // num_frequency_bins - this.config.num_mel_bins, // num_mel_filters - 20, // min_frequency - Math.floor(sampling_rate / 2), // max_frequency - sampling_rate, // sampling_rate - null, // norm - "kaldi", // mel_scale - true, // triangularize_in_mel_space - ); + this.window = window_function(400, "hann", { + periodic: false, + }); - // Do padding: - for (let i = 0; i < mel_filters.length; ++i) { - mel_filters[i].push(0); - } - this.mel_filters = mel_filters; + this.mean = this.config.mean; + this.std = this.config.std; + } - this.window = window_function(400, 'hann', { - periodic: false, - }) + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @param {number} max_length The maximum number of frames to return. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
+ */ + _extract_fbank_features(waveform, max_length) { + // NOTE: We don't pad/truncate since that is passed in as `max_num_frames` + return spectrogram( + waveform, + this.window, // window + 400, // frame_length + 160, // hop_length + { + fft_length: 512, + power: 2.0, + center: false, + preemphasis: 0.97, + mel_filters: this.mel_filters, + log_mel: "log", + mel_floor: 1.192092955078125e-7, + remove_dc_offset: true, - this.mean = this.config.mean; - this.std = this.config.std; + // Custom + max_num_frames: max_length, + transpose: true, + }, + ); + } + + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + async _call(audio) { + validate_audio_inputs(audio, "ASTFeatureExtractor"); + + const features = this._extract_fbank_features( + audio, + this.config.max_length, + ); + if (this.config.do_normalize) { + // Normalize the input audio spectrogram to have mean=0, std=0.5 + const denom = this.std * 2; + for (let i = 0; i < features.data.length; ++i) { + features.data[i] = (features.data[i] - this.mean) / denom; + } } - /** - * Computes the log-Mel spectrogram of the provided audio waveform. - * @param {Float32Array|Float64Array} waveform The audio waveform to process. - * @param {number} max_length The maximum number of frames to return. - * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
- */ - _extract_fbank_features(waveform, max_length) { - // NOTE: We don't pad/truncate since that is passed in as `max_num_frames` - return spectrogram( - waveform, - this.window, // window - 400, // frame_length - 160, // hop_length - { - fft_length: 512, - power: 2.0, - center: false, - preemphasis: 0.97, - mel_filters: this.mel_filters, - log_mel: 'log', - mel_floor: 1.192092955078125e-07, - remove_dc_offset: true, - - // Custom - max_num_frames: max_length, - transpose: true, - } - ) - } - - - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. - */ - async _call(audio) { - validate_audio_inputs(audio, 'ASTFeatureExtractor'); - - const features = this._extract_fbank_features(audio, this.config.max_length); - if (this.config.do_normalize) { - // Normalize the input audio spectrogram to have mean=0, std=0.5 - const denom = this.std * 2; - for (let i = 0; i < features.data.length; ++i) { - features.data[i] = (features.data[i] - this.mean) / denom; - } - } - - return { - input_values: new Tensor('float32', - features.data, - [1, ...features.dims] - ) - }; - } + return { + input_values: new Tensor("float32", features.data, [1, ...features.dims]), + }; + } } export class ClapFeatureExtractor extends FeatureExtractor { + constructor(config) { + super(config); - constructor(config) { - super(config); + this.mel_filters = mel_filter_bank( + this.config.nb_frequency_bins, // num_frequency_bins + this.config.feature_size, // num_mel_filters + this.config.frequency_min, // min_frequency + this.config.frequency_max, // max_frequency + this.config.sampling_rate, // sampling_rate + null, // norm + "htk", // mel_scale + ); - this.mel_filters = mel_filter_bank( - this.config.nb_frequency_bins, // 
num_frequency_bins - this.config.feature_size, // num_mel_filters - this.config.frequency_min, // min_frequency - this.config.frequency_max, // max_frequency - this.config.sampling_rate, // sampling_rate - null, // norm - "htk", // mel_scale + this.mel_filters_slaney = mel_filter_bank( + this.config.nb_frequency_bins, // num_frequency_bins + this.config.feature_size, // num_mel_filters + this.config.frequency_min, // min_frequency + this.config.frequency_max, // max_frequency + this.config.sampling_rate, // sampling_rate + "slaney", // norm + "slaney", // mel_scale + ); + + this.window = window_function(this.config.fft_window_size, "hann"); + } + + /** + * Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. + * + * Four different path are possible: + * - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram + * will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram + * are then stacked together. They will later be used for `feature_fusion`. + * - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is + * padded based on `padding`. + * - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded + * based on `padding`, and is repeated `4` times. + * - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel + * spectrogram will be computed on a random crop of the waveform. + * + * @param {Float32Array|Float64Array} waveform The input waveform. + * @param {number} max_length The maximum length of the waveform. + * @param {string} truncation The truncation strategy to use. + * @param {string} padding The padding strategy to use. 
+ * @returns {{ data: Float32Array; dims: number[]; longer: boolean; }} An object containing the mel spectrogram data as a Float32Array, its dimensions as an array of numbers, and a boolean indicating whether the waveform was longer than the max length. + */ + _get_input_mel(waveform, max_length, truncation, padding) { + /** @type {{ data: Float32Array; dims: number[]}} */ + let input_mel; + let longer = false; + const diff = waveform.length - max_length; + if (diff > 0) { + if (truncation === "rand_trunc") { + longer = true; + const idx = Math.floor(Math.random() * (diff + 1)); + waveform = waveform.subarray(idx, idx + max_length); + + input_mel = this._extract_fbank_features( + waveform, + this.mel_filters_slaney, + this.config.nb_max_samples, ); + input_mel.dims = [1, ...input_mel.dims]; // "unsqueeze" + } else { + // TODO implement fusion strategy + throw new Error(`Truncation strategy "${truncation}" not implemented`); + } + } else { + if (diff < 0) { + let padded = new Float64Array(max_length); // already padded with zeros + padded.set(waveform); - this.mel_filters_slaney = mel_filter_bank( - this.config.nb_frequency_bins, // num_frequency_bins - this.config.feature_size, // num_mel_filters - this.config.frequency_min, // min_frequency - this.config.frequency_max, // max_frequency - this.config.sampling_rate, // sampling_rate - "slaney", // norm - "slaney", // mel_scale - ); - - this.window = window_function(this.config.fft_window_size, 'hann') - - } - - - /** - * Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. - * - * Four different path are possible: - * - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram - * will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram - * are then stacked together. They will later be used for `feature_fusion`. 
- * - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is - * padded based on `padding`. - * - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded - * based on `padding`, and is repeated `4` times. - * - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel - * spectrogram will be computed on a random crop of the waveform. - * - * @param {Float32Array|Float64Array} waveform The input waveform. - * @param {number} max_length The maximum length of the waveform. - * @param {string} truncation The truncation strategy to use. - * @param {string} padding The padding strategy to use. - * @returns {{ data: Float32Array; dims: number[]; longer: boolean; }} An object containing the mel spectrogram data as a Float32Array, its dimensions as an array of numbers, and a boolean indicating whether the waveform was longer than the max length. - */ - _get_input_mel(waveform, max_length, truncation, padding) { - - /** @type {{ data: Float32Array; dims: number[]}} */ - let input_mel; - let longer = false; - const diff = waveform.length - max_length; - if (diff > 0) { - if (truncation === 'rand_trunc') { - longer = true; - const idx = Math.floor(Math.random() * (diff + 1)); - waveform = waveform.subarray(idx, idx + max_length); - - input_mel = this._extract_fbank_features(waveform, this.mel_filters_slaney, this.config.nb_max_samples); - input_mel.dims = [1, ...input_mel.dims]; // "unsqueeze" - } else { - // TODO implement fusion strategy - throw new Error(`Truncation strategy "${truncation}" not implemented`) - } - } else { - if (diff < 0) { - let padded = new Float64Array(max_length); // already padded with zeros - padded.set(waveform); - - if (padding === 'repeat') { - for (let i = waveform.length; i < max_length; i += waveform.length) { - padded.set(waveform.subarray(0, Math.min(waveform.length, max_length - i)), i); - } - } else if 
(padding === 'repeatpad') { - for (let i = waveform.length; i < -diff; i += waveform.length) { - padded.set(waveform, i); - } - } - waveform = padded; - } - - if (truncation === 'fusion') { - throw new Error(`Truncation strategy "${truncation}" not implemented`) - } - - input_mel = this._extract_fbank_features(waveform, this.mel_filters_slaney, this.config.nb_max_samples); - input_mel.dims = [1, ...input_mel.dims]; // "unsqueeze" + if (padding === "repeat") { + for (let i = waveform.length; i < max_length; i += waveform.length) { + padded.set( + waveform.subarray(0, Math.min(waveform.length, max_length - i)), + i, + ); + } + } else if (padding === "repeatpad") { + for (let i = waveform.length; i < -diff; i += waveform.length) { + padded.set(waveform, i); + } } + waveform = padded; + } - return { - ...input_mel, - longer, - } + if (truncation === "fusion") { + throw new Error(`Truncation strategy "${truncation}" not implemented`); + } + + input_mel = this._extract_fbank_features( + waveform, + this.mel_filters_slaney, + this.config.nb_max_samples, + ); + input_mel.dims = [1, ...input_mel.dims]; // "unsqueeze" } - /** - * Compute the log-mel spectrogram of the provided `waveform` using the Hann window. - * In CLAP, two different filter banks are used depending on the truncation pattern: - * - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from - * calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` - * is set to `"fusion"`. - * - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used - * `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original - * implementation when the truncation mode is not `"fusion"`. - * - * @param {Float32Array|Float64Array} waveform The audio waveform to process. - * @param {number[][]} mel_filters The mel filters to use. 
- * @param {number} [max_length=null] The maximum number of frames to return. - * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. - */ - _extract_fbank_features(waveform, mel_filters, max_length = null) { - // NOTE: We don't pad/truncate since that is passed in as `max_num_frames` - return spectrogram( - waveform, - this.window, // window - this.config.fft_window_size, // frame_length - this.config.hop_length, // hop_length - { - power: 2.0, - mel_filters, - log_mel: 'dB', + return { + ...input_mel, + longer, + }; + } - // Custom - max_num_frames: max_length, - do_pad: false, - transpose: true, - } - ) - } + /** + * Compute the log-mel spectrogram of the provided `waveform` using the Hann window. + * In CLAP, two different filter banks are used depending on the truncation pattern: + * - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from + * calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` + * is set to `"fusion"`. + * - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used + * `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original + * implementation when the truncation mode is not `"fusion"`. + * + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @param {number[][]} mel_filters The mel filters to use. + * @param {number} [max_length=null] The maximum number of frames to return. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
+ */ + _extract_fbank_features(waveform, mel_filters, max_length = null) { + // NOTE: We don't pad/truncate since that is passed in as `max_num_frames` + return spectrogram( + waveform, + this.window, // window + this.config.fft_window_size, // frame_length + this.config.hop_length, // hop_length + { + power: 2.0, + mel_filters, + log_mel: "dB", + // Custom + max_num_frames: max_length, + do_pad: false, + transpose: true, + }, + ); + } - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. - */ - async _call(audio, { - max_length = null, - } = {}) { - validate_audio_inputs(audio, 'ClapFeatureExtractor'); + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + async _call(audio, { max_length = null } = {}) { + validate_audio_inputs(audio, "ClapFeatureExtractor"); - // convert to mel spectrogram, truncate and pad if needed. - const padded_inputs = this._get_input_mel( - audio, - max_length ?? this.config.nb_max_samples, - this.config.truncation, - this.config.padding, - ); + // convert to mel spectrogram, truncate and pad if needed. + const padded_inputs = this._get_input_mel( + audio, + max_length ?? 
this.config.nb_max_samples, + this.config.truncation, + this.config.padding, + ); - - return { - input_features: new Tensor('float32', - padded_inputs.data, - [1, ...padded_inputs.dims] - ) - }; - } + return { + input_features: new Tensor("float32", padded_inputs.data, [ + 1, + ...padded_inputs.dims, + ]), + }; + } } - - -export class SpeechT5FeatureExtractor extends FeatureExtractor { } +export class SpeechT5FeatureExtractor extends FeatureExtractor {} /** * Represents a Processor that extracts features from an input. * @extends Callable */ export class Processor extends Callable { - /** - * Creates a new Processor with the given feature extractor. - * @param {FeatureExtractor} feature_extractor The function used to extract features from the input. - */ - constructor(feature_extractor) { - super(); - this.feature_extractor = feature_extractor; - // TODO use tokenizer here? - } + /** + * Creates a new Processor with the given feature extractor. + * @param {FeatureExtractor} feature_extractor The function used to extract features from the input. + */ + constructor(feature_extractor) { + super(); + this.feature_extractor = feature_extractor; + // TODO use tokenizer here? + } - /** - * Calls the feature_extractor function with the given input. - * @param {any} input The input to extract features from. - * @param {...any} args Additional arguments. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - async _call(input, ...args) { - return await this.feature_extractor(input, ...args); - } + /** + * Calls the feature_extractor function with the given input. + * @param {any} input The input to extract features from. + * @param {...any} args Additional arguments. + * @returns {Promise} A Promise that resolves with the extracted features. 
+ */ + async _call(input, ...args) { + return await this.feature_extractor(input, ...args); + } } export class SamProcessor extends Processor { - /** - * @borrows SamImageProcessor#_call as _call - */ - async _call(...args) { - return await this.feature_extractor(...args); - } + /** + * @borrows SamImageProcessor#_call as _call + */ + async _call(...args) { + return await this.feature_extractor(...args); + } - /** - * @borrows SamImageProcessor#post_process_masks as post_process_masks - */ - post_process_masks(...args) { - // @ts-ignore - return this.feature_extractor.post_process_masks(...args); - } - /** - * @borrows SamImageProcessor#reshape_input_points as reshape_input_points - */ - reshape_input_points(...args) { - // @ts-ignore - return this.feature_extractor.reshape_input_points(...args); - } + /** + * @borrows SamImageProcessor#post_process_masks as post_process_masks + */ + post_process_masks(...args) { + // @ts-ignore + return this.feature_extractor.post_process_masks(...args); + } + /** + * @borrows SamImageProcessor#reshape_input_points as reshape_input_points + */ + reshape_input_points(...args) { + // @ts-ignore + return this.feature_extractor.reshape_input_points(...args); + } } /** @@ -1801,52 +1888,50 @@ export class SamProcessor extends Processor { * @extends Processor */ export class WhisperProcessor extends Processor { - /** - * Calls the feature_extractor function with the given audio input. - * @param {any} audio The audio input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - async _call(audio) { - return await this.feature_extractor(audio) - } + /** + * Calls the feature_extractor function with the given audio input. + * @param {any} audio The audio input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. 
+ */ + async _call(audio) { + return await this.feature_extractor(audio); + } } - export class Wav2Vec2ProcessorWithLM extends Processor { - /** - * Calls the feature_extractor function with the given audio input. - * @param {any} audio The audio input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - async _call(audio) { - return await this.feature_extractor(audio) - } + /** + * Calls the feature_extractor function with the given audio input. + * @param {any} audio The audio input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + async _call(audio) { + return await this.feature_extractor(audio); + } } export class SpeechT5Processor extends Processor { - /** - * Calls the feature_extractor function with the given input. - * @param {any} input The input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - async _call(input) { - return await this.feature_extractor(input) - } + /** + * Calls the feature_extractor function with the given input. + * @param {any} input The input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + async _call(input) { + return await this.feature_extractor(input); + } } -export class OwlViTProcessor extends Processor { } - +export class OwlViTProcessor extends Processor {} ////////////////////////////////////////////////// /** * Helper class which is used to instantiate pretrained processors with the `from_pretrained` function. * The chosen processor class is determined by the type specified in the processor config. - * + * * **Example:** Load a processor using `from_pretrained`. * ```javascript * let processor = await AutoProcessor.from_pretrained('openai/whisper-tiny.en'); * ``` - * + * * **Example:** Run an image through a processor. 
* ```javascript * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); @@ -1869,98 +1954,112 @@ export class OwlViTProcessor extends Processor { } * ``` */ export class AutoProcessor { - static FEATURE_EXTRACTOR_CLASS_MAPPING = { - WhisperFeatureExtractor, - ViTFeatureExtractor, - MobileViTFeatureExtractor, - OwlViTFeatureExtractor, - CLIPFeatureExtractor, - ChineseCLIPFeatureExtractor, - SiglipImageProcessor, - ConvNextFeatureExtractor, - ConvNextImageProcessor, - SegformerFeatureExtractor, - BitImageProcessor, - DPTFeatureExtractor, - GLPNFeatureExtractor, - BeitFeatureExtractor, - DeiTFeatureExtractor, - DetrFeatureExtractor, - YolosFeatureExtractor, - DonutFeatureExtractor, - NougatImageProcessor, + static FEATURE_EXTRACTOR_CLASS_MAPPING = { + WhisperFeatureExtractor, + ViTFeatureExtractor, + MobileViTFeatureExtractor, + OwlViTFeatureExtractor, + CLIPFeatureExtractor, + ChineseCLIPFeatureExtractor, + SiglipImageProcessor, + ConvNextFeatureExtractor, + ConvNextImageProcessor, + SegformerFeatureExtractor, + BitImageProcessor, + DPTFeatureExtractor, + GLPNFeatureExtractor, + BeitFeatureExtractor, + DeiTFeatureExtractor, + DetrFeatureExtractor, + YolosFeatureExtractor, + DonutFeatureExtractor, + NougatImageProcessor, - ViTImageProcessor, - VitMatteImageProcessor, - SamImageProcessor, - Swin2SRImageProcessor, - Wav2Vec2FeatureExtractor, - SpeechT5FeatureExtractor, - ASTFeatureExtractor, - ClapFeatureExtractor, + ViTImageProcessor, + VitMatteImageProcessor, + SamImageProcessor, + Swin2SRImageProcessor, + Wav2Vec2FeatureExtractor, + SpeechT5FeatureExtractor, + ASTFeatureExtractor, + ClapFeatureExtractor, + }; + + static PROCESSOR_CLASS_MAPPING = { + WhisperProcessor, + Wav2Vec2ProcessorWithLM, + SamProcessor, + SpeechT5Processor, + OwlViTProcessor, + }; + + /** + * Instantiate one of the processor classes of the library from a pretrained model. 
+ * + * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`. + * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the processor. + * + * @returns {Promise} A new instance of the Processor class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + } = {}, + ) { + let preprocessorConfig = + config ?? + (await getModelJSON( + pretrained_model_name_or_path, + "preprocessor_config.json", + true, + { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }, + )); + + // Determine feature extractor class + // TODO: Ensure backwards compatibility with old configs + let key = + preprocessorConfig.feature_extractor_type ?? 
+ preprocessorConfig.image_processor_type; + let feature_extractor_class = this.FEATURE_EXTRACTOR_CLASS_MAPPING[key]; + + if (!feature_extractor_class) { + if (preprocessorConfig.size !== undefined) { + // Assume ImageFeatureExtractor + console.warn( + `Feature extractor type "${key}" not found, assuming ImageFeatureExtractor due to size parameter in config.`, + ); + feature_extractor_class = ImageFeatureExtractor; + } else { + throw new Error(`Unknown Feature Extractor type: ${key}`); + } } - static PROCESSOR_CLASS_MAPPING = { - WhisperProcessor, - Wav2Vec2ProcessorWithLM, - SamProcessor, - SpeechT5Processor, - OwlViTProcessor, - } + // If no associated processor class, use default + let processor_class = + this.PROCESSOR_CLASS_MAPPING[preprocessorConfig.processor_class] ?? + Processor; - /** - * Instantiate one of the processor classes of the library from a pretrained model. - * - * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the processor. - * - * @returns {Promise} A new instance of the Processor class. - */ - static async from_pretrained(pretrained_model_name_or_path, { - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - } = {}) { - - let preprocessorConfig = config ?? 
await getModelJSON(pretrained_model_name_or_path, 'preprocessor_config.json', true, { - progress_callback, - config, - cache_dir, - local_files_only, - revision, - }) - - // Determine feature extractor class - // TODO: Ensure backwards compatibility with old configs - let key = preprocessorConfig.feature_extractor_type ?? preprocessorConfig.image_processor_type; - let feature_extractor_class = this.FEATURE_EXTRACTOR_CLASS_MAPPING[key]; - - if (!feature_extractor_class) { - if (preprocessorConfig.size !== undefined) { - // Assume ImageFeatureExtractor - console.warn(`Feature extractor type "${key}" not found, assuming ImageFeatureExtractor due to size parameter in config.`); - feature_extractor_class = ImageFeatureExtractor; - } else { - throw new Error(`Unknown Feature Extractor type: ${key}`); - } - } - - // If no associated processor class, use default - let processor_class = this.PROCESSOR_CLASS_MAPPING[preprocessorConfig.processor_class] ?? Processor; - - // Instantiate processor and feature extractor - let feature_extractor = new feature_extractor_class(preprocessorConfig); - return new processor_class(feature_extractor); - } + // Instantiate processor and feature extractor + let feature_extractor = new feature_extractor_class(preprocessorConfig); + return new processor_class(feature_extractor); + } } ////////////////////////////////////////////////// - diff --git a/core/vendor/modules/@xenova/transformers/src/tokenizers.js b/core/vendor/modules/@xenova/transformers/src/tokenizers.js index a786cd1ed..6bed31a17 100644 --- a/core/vendor/modules/@xenova/transformers/src/tokenizers.js +++ b/core/vendor/modules/@xenova/transformers/src/tokenizers.js @@ -1,12 +1,11 @@ - /** * @file Tokenizers are used to prepare textual inputs for a model. - * + * * **Example:** Create an `AutoTokenizer` and use it to tokenize a sentence. * This will automatically detect the tokenizer type based on the tokenizer class defined in `tokenizer.json`. 
* ```javascript * import { AutoTokenizer } from '@xenova/transformers'; - * + * * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); * const { input_ids } = await tokenizer('I love transformers!'); * // Tensor { @@ -16,33 +15,30 @@ * // size: 6, * // } * ``` - * + * * @module tokenizers */ import { - Callable, - reverseDictionary, - escapeRegExp, - isIntegralNumber, - mergeArrays, -} from './utils/core.js'; + Callable, + reverseDictionary, + escapeRegExp, + isIntegralNumber, + mergeArrays, +} from "./utils/core.js"; + +import { getModelJSON } from "./utils/hub.js"; + +import { max, min, round } from "./utils/maths.js"; +import { Tensor } from "./utils/tensor.js"; import { - getModelJSON, -} from './utils/hub.js'; - -import { max, min, round } from './utils/maths.js'; -import { Tensor } from './utils/tensor.js'; - -import { - PriorityQueue, - TokenLattice, - CharTrie, -} from './utils/data-structures.js'; - -import { Template } from '@huggingface/jinja'; + PriorityQueue, + TokenLattice, + CharTrie, +} from "./utils/data-structures.js"; +import { Template } from "@huggingface/jinja"; /** * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties. @@ -57,20 +53,28 @@ import { Template } from '@huggingface/jinja'; * @returns {Promise} A promise that resolves with information about the loaded tokenizer. 
*/ async function loadTokenizer(pretrained_model_name_or_path, options) { + const info = await Promise.all([ + getModelJSON( + pretrained_model_name_or_path, + "tokenizer.json", + true, + options, + ), + getModelJSON( + pretrained_model_name_or_path, + "tokenizer_config.json", + true, + options, + ), + ]); - const info = await Promise.all([ - getModelJSON(pretrained_model_name_or_path, 'tokenizer.json', true, options), - getModelJSON(pretrained_model_name_or_path, 'tokenizer_config.json', true, options), - ]) - - // Override legacy option if `options.legacy` is not null - if (options.legacy !== null) { - info[1].legacy = options.legacy; - } - return info; + // Override legacy option if `options.legacy` is not null + if (options.legacy !== null) { + info[1].legacy = options.legacy; + } + return info; } - /** * Helper function to split a string on a regex, but keep the delimiters. * This is required, because the JavaScript `.split()` method does not keep the delimiters, @@ -80,25 +84,24 @@ async function loadTokenizer(pretrained_model_name_or_path, options) { * @returns {string[]} The split string. */ function regexSplit(text, regex) { - const result = []; - let prev = 0; - for (const match of text.matchAll(regex)) { - const fullMatch = match[0]; - if (prev < match.index) { - result.push(text.slice(prev, match.index)); - } - if (fullMatch.length > 0) { - result.push(fullMatch); - } - prev = match.index + fullMatch.length; + const result = []; + let prev = 0; + for (const match of text.matchAll(regex)) { + const fullMatch = match[0]; + if (prev < match.index) { + result.push(text.slice(prev, match.index)); } - if (prev < text.length) { - result.push(text.slice(prev)); + if (fullMatch.length > 0) { + result.push(fullMatch); } - return result; + prev = match.index + fullMatch.length; + } + if (prev < text.length) { + result.push(text.slice(prev)); + } + return result; } - /** * Helper method to construct a pattern from a config object. 
* @param {Object} pattern The pattern object. @@ -106,25 +109,22 @@ function regexSplit(text, regex) { * @returns {RegExp|null} The compiled pattern. */ function createPattern(pattern, invert = true) { - - if (pattern.Regex !== undefined) { - // In certain cases, the pattern may contain unnecessary escape sequences (e.g., \# or \& or \~). - // i.e., valid in Python (where the patterns are exported from) but invalid in JavaScript (where the patterns are parsed). - // This isn't an issue when creating the regex w/o the 'u' flag, but it is when the 'u' flag is used. - // For this reason, it is necessary to remove these backslashes before creating the regex. - // See https://stackoverflow.com/a/63007777/13989043 for more information - const regex = pattern.Regex.replace(/\\([#&~])/g, '$1'); // TODO: add more characters to this list if necessary - return new RegExp(regex, 'gu'); - - } else if (pattern.String !== undefined) { - const escaped = escapeRegExp(pattern.String); - // NOTE: if invert is true, we wrap the pattern in a group so that it is kept when performing .split() - return new RegExp(invert ? escaped : `(${escaped})`, 'gu'); - - } else { - console.warn('Unknown pattern type:', pattern) - return null; - } + if (pattern.Regex !== undefined) { + // In certain cases, the pattern may contain unnecessary escape sequences (e.g., \# or \& or \~). + // i.e., valid in Python (where the patterns are exported from) but invalid in JavaScript (where the patterns are parsed). + // This isn't an issue when creating the regex w/o the 'u' flag, but it is when the 'u' flag is used. + // For this reason, it is necessary to remove these backslashes before creating the regex. 
+ // See https://stackoverflow.com/a/63007777/13989043 for more information + const regex = pattern.Regex.replace(/\\([#&~])/g, "$1"); // TODO: add more characters to this list if necessary + return new RegExp(regex, "gu"); + } else if (pattern.String !== undefined) { + const escaped = escapeRegExp(pattern.String); + // NOTE: if invert is true, we wrap the pattern in a group so that it is kept when performing .split() + return new RegExp(invert ? escaped : `(${escaped})`, "gu"); + } else { + console.warn("Unknown pattern type:", pattern); + return null; + } } /** @@ -133,7 +133,7 @@ function createPattern(pattern, invert = true) { * @returns {Map} The map. */ function objectToMap(obj) { - return new Map(Object.entries(obj)); + return new Map(Object.entries(obj)); } /** @@ -142,18 +142,22 @@ function objectToMap(obj) { * @returns {number[]} The tensor as a list. */ function prepareTensorForDecode(tensor) { - const dims = tensor.dims; - switch (dims.length) { - case 1: - return tensor.tolist(); - case 2: - if (dims[0] !== 1) { - throw new Error('Unable to decode tensor with `batch size !== 1`. Use `tokenizer.batch_decode(...)` for batched inputs.'); - } - return tensor.tolist()[0]; - default: - throw new Error(`Expected tensor to have 1-2 dimensions, got ${dims.length}.`) - } + const dims = tensor.dims; + switch (dims.length) { + case 1: + return tensor.tolist(); + case 2: + if (dims[0] !== 1) { + throw new Error( + "Unable to decode tensor with `batch size !== 1`. Use `tokenizer.batch_decode(...)` for batched inputs.", + ); + } + return tensor.tolist()[0]; + default: + throw new Error( + `Expected tensor to have 1-2 dimensions, got ${dims.length}.`, + ); + } } /** @@ -162,18 +166,19 @@ function prepareTensorForDecode(tensor) { * @returns {string} The cleaned up text. 
*/ function clean_up_tokenization(text) { - // Clean up a list of simple English tokenization artifacts - // like spaces before punctuations and abbreviated forms - return text.replace(/ \./g, '.') - .replace(/ \?/g, '?') - .replace(/ \!/g, '!') - .replace(/ ,/g, ',') - .replace(/ \' /g, "'") - .replace(/ n\'t/g, "n't") - .replace(/ \'m/g, "'m") - .replace(/ \'s/g, "'s") - .replace(/ \'ve/g, "'ve") - .replace(/ \'re/g, "'re"); + // Clean up a list of simple English tokenization artifacts + // like spaces before punctuations and abbreviated forms + return text + .replace(/ \./g, ".") + .replace(/ \?/g, "?") + .replace(/ \!/g, "!") + .replace(/ ,/g, ",") + .replace(/ \' /g, "'") + .replace(/ n\'t/g, "n't") + .replace(/ \'m/g, "'m") + .replace(/ \'s/g, "'s") + .replace(/ \'ve/g, "'ve") + .replace(/ \'re/g, "'re"); } /** @@ -182,7 +187,7 @@ function clean_up_tokenization(text) { * @returns {string} The text with accents removed. */ function remove_accents(text) { - return text.replace(/[\u0300-\u036f]/g, ''); + return text.replace(/[\u0300-\u036f]/g, ""); } /** @@ -191,7 +196,7 @@ function remove_accents(text) { * @returns {string} The lowercased text with accents removed. */ function lowercase_and_remove_accent(text) { - return remove_accents(text.toLowerCase()); + return remove_accents(text.toLowerCase()); } /** @@ -201,21 +206,21 @@ function lowercase_and_remove_accent(text) { * @param {Map} mapping The mapping from input domain to value. */ function fuse(arr, value, mapping) { - const fused = []; - let i = 0; - while (i < arr.length) { - fused.push(arr[i]) - if ((mapping.get(arr[i]) ?? value) !== value) { - ++i; - continue; - } - - while (i < arr.length && (mapping.get(arr[i]) ?? value) === value) { - ++i; - } + const fused = []; + let i = 0; + while (i < arr.length) { + fused.push(arr[i]); + if ((mapping.get(arr[i]) ?? value) !== value) { + ++i; + continue; } - return fused; + while (i < arr.length && (mapping.get(arr[i]) ?? 
value) === value) { + ++i; + } + } + + return fused; } /** @@ -224,10 +229,11 @@ function fuse(arr, value, mapping) { * @returns {string[]} The split string. */ function whitespace_split(text) { - return text.match(/\S+/g) || []; + return text.match(/\S+/g) || []; } -const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E'; +const PUNCTUATION_REGEX = + "\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E"; /** * Represent a token added by the user on top of the existing Model vocabulary. @@ -236,26 +242,26 @@ const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\ * - Whether to include any whitespace on its left or right */ class AddedToken { - /** - * Creates a new instance of AddedToken. - * @param {Object} config Added token configuration object. - * @param {string} config.content The content of the added token. - * @param {number} config.id The id of the added token. - * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. - * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. - * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. - * @param {boolean} [config.normalized=false] Whether this token should be normalized. - * @param {boolean} [config.special=false] Whether this token is special. - */ - constructor(config) { - this.content = config.content; - this.id = config.id; - this.single_word = config.single_word ?? false; - this.lstrip = config.lstrip ?? false; - this.rstrip = config.rstrip ?? false; - this.special = config.special ?? false; - this.normalized = config.normalized ?? null; - } + /** + * Creates a new instance of AddedToken. + * @param {Object} config Added token configuration object. + * @param {string} config.content The content of the added token. + * @param {number} config.id The id of the added token. 
+ * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. + * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. + * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. + * @param {boolean} [config.normalized=false] Whether this token should be normalized. + * @param {boolean} [config.special=false] Whether this token is special. + */ + constructor(config) { + this.content = config.content; + this.id = config.id; + this.single_word = config.single_word ?? false; + this.lstrip = config.lstrip ?? false; + this.rstrip = config.rstrip ?? false; + this.special = config.special ?? false; + this.normalized = config.normalized ?? null; + } } /** @@ -264,99 +270,99 @@ class AddedToken { * @extends Callable */ export class TokenizerModel extends Callable { - /** - * Creates a new instance of TokenizerModel. - * @param {Object} config The configuration object for the TokenizerModel. - */ - constructor(config) { - super(); - this.config = config; + /** + * Creates a new instance of TokenizerModel. + * @param {Object} config The configuration object for the TokenizerModel. + */ + constructor(config) { + super(); + this.config = config; - /** @type {string[]} */ - this.vocab = []; - - /** - * A mapping of tokens to ids. - * @type {Map} - */ - this.tokens_to_ids = new Map(); - - this.unk_token_id = undefined; - this.unk_token = undefined; - this.end_of_word_suffix = undefined; - - /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ - this.fuse_unk = this.config.fuse_unk ?? false; - } + /** @type {string[]} */ + this.vocab = []; /** - * Instantiates a new TokenizerModel instance based on the configuration object provided. - * @param {Object} config The configuration object for the TokenizerModel. - * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. 
- * @returns {TokenizerModel} A new instance of a TokenizerModel. - * @throws Will throw an error if the TokenizerModel type in the config is not recognized. + * A mapping of tokens to ids. + * @type {Map} */ - static fromConfig(config, ...args) { - switch (config.type) { - case 'WordPiece': - return new WordPieceTokenizer(config); - case 'Unigram': - // @ts-ignore - return new Unigram(config, ...args); + this.tokens_to_ids = new Map(); - case 'BPE': - return new BPE(config); + this.unk_token_id = undefined; + this.unk_token = undefined; + this.end_of_word_suffix = undefined; - default: - if (config.vocab) { - // @ts-ignore - return new LegacyTokenizerModel(config, ...args); - } - throw new Error(`Unknown TokenizerModel type: ${config.type}`); + /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ + this.fuse_unk = this.config.fuse_unk ?? false; + } + + /** + * Instantiates a new TokenizerModel instance based on the configuration object provided. + * @param {Object} config The configuration object for the TokenizerModel. + * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. + * @returns {TokenizerModel} A new instance of a TokenizerModel. + * @throws Will throw an error if the TokenizerModel type in the config is not recognized. + */ + static fromConfig(config, ...args) { + switch (config.type) { + case "WordPiece": + return new WordPieceTokenizer(config); + case "Unigram": + // @ts-ignore + return new Unigram(config, ...args); + + case "BPE": + return new BPE(config); + + default: + if (config.vocab) { + // @ts-ignore + return new LegacyTokenizerModel(config, ...args); } + throw new Error(`Unknown TokenizerModel type: ${config.type}`); } + } - /** - * Internal function to call the TokenizerModel instance. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} The encoded token IDs. 
- */ - _call(tokens) { - let ids = this.encode(tokens); - if (this.fuse_unk) { - // Fuse unknown tokens - ids = fuse(ids, this.unk_token_id, this.tokens_to_ids); - } - return ids; + /** + * Internal function to call the TokenizerModel instance. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} The encoded token IDs. + */ + _call(tokens) { + let ids = this.encode(tokens); + if (this.fuse_unk) { + // Fuse unknown tokens + ids = fuse(ids, this.unk_token_id, this.tokens_to_ids); } + return ids; + } - /** - * Encodes a list of tokens into a list of token IDs. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} The encoded tokens. - * @throws Will throw an error if not implemented in a subclass. - */ - encode(tokens) { - throw Error("encode should be implemented in subclass.") - } + /** + * Encodes a list of tokens into a list of token IDs. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} The encoded tokens. + * @throws Will throw an error if not implemented in a subclass. + */ + encode(tokens) { + throw Error("encode should be implemented in subclass."); + } - /** - * Converts a list of tokens into a list of token IDs. - * @param {string[]} tokens The tokens to convert. - * @returns {number[]} The converted token IDs. - */ - convert_tokens_to_ids(tokens) { - return tokens.map(t => this.tokens_to_ids.get(t) ?? this.unk_token_id); - } + /** + * Converts a list of tokens into a list of token IDs. + * @param {string[]} tokens The tokens to convert. + * @returns {number[]} The converted token IDs. + */ + convert_tokens_to_ids(tokens) { + return tokens.map((t) => this.tokens_to_ids.get(t) ?? this.unk_token_id); + } - /** - * Converts a list of token IDs into a list of tokens. - * @param {number[]} ids The token IDs to convert. - * @returns {string[]} The converted tokens. - */ - convert_ids_to_tokens(ids) { - return ids.map(i => this.vocab[i] ?? 
this.unk_token); - } + /** + * Converts a list of token IDs into a list of tokens. + * @param {number[]} ids The token IDs to convert. + * @returns {string[]} The converted tokens. + */ + convert_ids_to_tokens(ids) { + return ids.map((i) => this.vocab[i] ?? this.unk_token); + } } /** @@ -364,100 +370,99 @@ export class TokenizerModel extends Callable { * @extends TokenizerModel */ class WordPieceTokenizer extends TokenizerModel { + /** + * @param {Object} config The configuration object. + * @param {Object} config.vocab A mapping of tokens to ids. + * @param {string} config.unk_token The unknown token string. + * @param {string} config.continuing_subword_prefix The prefix to use for continuing subwords. + * @param {number} [config.max_input_chars_per_word=100] The maximum number of characters per word. + */ + constructor(config) { + super(config); /** - * @param {Object} config The configuration object. - * @param {Object} config.vocab A mapping of tokens to ids. - * @param {string} config.unk_token The unknown token string. - * @param {string} config.continuing_subword_prefix The prefix to use for continuing subwords. - * @param {number} [config.max_input_chars_per_word=100] The maximum number of characters per word. + * A mapping of tokens to ids. + * @type {Map} */ - constructor(config) { - super(config); - /** - * A mapping of tokens to ids. - * @type {Map} - */ - this.tokens_to_ids = objectToMap(config.vocab); - - /** - * The id of the unknown token. - * @type {number} - */ - this.unk_token_id = this.tokens_to_ids.get(config.unk_token); - - /** - * The unknown token string. - * @type {string} - */ - this.unk_token = config.unk_token; - - /** - * The maximum number of characters allowed per word. - * @type {number} - */ - this.max_input_chars_per_word = config.max_input_chars_per_word ?? 100; - - /** - * An array of tokens. 
- * @type {string[]} - */ - this.vocab = new Array(this.tokens_to_ids.size); - for (const [key, value] of this.tokens_to_ids) { - this.vocab[value] = key; - } - } + this.tokens_to_ids = objectToMap(config.vocab); /** - * Encodes an array of tokens using WordPiece encoding. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} An array of encoded tokens. + * The id of the unknown token. + * @type {number} */ - encode(tokens) { - const outputTokens = []; - for (const token of tokens) { - const chars = [...token]; - if (chars.length > this.max_input_chars_per_word) { - outputTokens.push(this.unk_token); - continue; - } + this.unk_token_id = this.tokens_to_ids.get(config.unk_token); - let isUnknown = false; - let start = 0; - const subTokens = []; + /** + * The unknown token string. + * @type {string} + */ + this.unk_token = config.unk_token; - while (start < chars.length) { - let end = chars.length; - let currentSubstring = null; - while (start < end) { - let substr = chars.slice(start, end).join(''); + /** + * The maximum number of characters allowed per word. + * @type {number} + */ + this.max_input_chars_per_word = config.max_input_chars_per_word ?? 100; - if (start > 0) { - substr = this.config.continuing_subword_prefix + substr; - } - if (this.tokens_to_ids.has(substr)) { - currentSubstring = substr; - break; - } + /** + * An array of tokens. + * @type {string[]} + */ + this.vocab = new Array(this.tokens_to_ids.size); + for (const [key, value] of this.tokens_to_ids) { + this.vocab[value] = key; + } + } - --end; - } - if (currentSubstring === null) { - isUnknown = true; - break; - } - subTokens.push(currentSubstring); - start = end; - } - if (isUnknown) { - outputTokens.push(this.unk_token); - } else { - outputTokens.push(...subTokens); - } + /** + * Encodes an array of tokens using WordPiece encoding. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} An array of encoded tokens. 
+ */ + encode(tokens) { + const outputTokens = []; + for (const token of tokens) { + const chars = [...token]; + if (chars.length > this.max_input_chars_per_word) { + outputTokens.push(this.unk_token); + continue; + } + + let isUnknown = false; + let start = 0; + const subTokens = []; + + while (start < chars.length) { + let end = chars.length; + let currentSubstring = null; + while (start < end) { + let substr = chars.slice(start, end).join(""); + + if (start > 0) { + substr = this.config.continuing_subword_prefix + substr; + } + if (this.tokens_to_ids.has(substr)) { + currentSubstring = substr; + break; + } + + --end; } - - return outputTokens; + if (currentSubstring === null) { + isUnknown = true; + break; + } + subTokens.push(currentSubstring); + start = end; + } + if (isUnknown) { + outputTokens.push(this.unk_token); + } else { + outputTokens.push(...subTokens); + } } + return outputTokens; + } } /** @@ -465,106 +470,111 @@ class WordPieceTokenizer extends TokenizerModel { * @extends TokenizerModel */ class Unigram extends TokenizerModel { - /** - * Create a new Unigram tokenizer model. - * @param {Object} config The configuration object for the Unigram model. - * @param {number} config.unk_id The ID of the unknown token - * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores. - * @param {Object} moreConfig Additional configuration object for the Unigram model. - */ - constructor(config, moreConfig) { - super(config); + /** + * Create a new Unigram tokenizer model. + * @param {Object} config The configuration object for the Unigram model. + * @param {number} config.unk_id The ID of the unknown token + * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores. + * @param {Object} moreConfig Additional configuration object for the Unigram model. 
+ */ + constructor(config, moreConfig) { + super(config); - const vocabSize = config.vocab.length; - this.vocab = new Array(vocabSize); - this.scores = new Array(vocabSize); - for (let i = 0; i < vocabSize; ++i) { - const piece = config.vocab[i]; - this.vocab[i] = piece[0]; - this.scores[i] = piece[1]; + const vocabSize = config.vocab.length; + this.vocab = new Array(vocabSize); + this.scores = new Array(vocabSize); + for (let i = 0; i < vocabSize; ++i) { + const piece = config.vocab[i]; + this.vocab[i] = piece[0]; + this.scores[i] = piece[1]; + } + + this.unk_token_id = config.unk_id; + this.unk_token = this.vocab[config.unk_id]; + + this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i])); + this.bosToken = " "; // beginning of a sentence token + + this.bosTokenId = this.tokens_to_ids.get(this.bosToken); // NOTE: may be undefined + this.eosToken = moreConfig.eos_token; + + this.eosTokenId = this.tokens_to_ids.get(this.eosToken); + this.unkToken = this.vocab[this.unk_token_id]; + + this.minScore = min(this.scores)[0]; + + this.unkScore = this.minScore - 10.0; + this.scores[this.unk_token_id] = this.unkScore; + + this.trie = new CharTrie(); + this.trie.extend(this.vocab); + + // NOTE: `fuse_unk` is hardcoded to true for Unigram models + // See: https://github.com/huggingface/tokenizers/blob/b58227c7f1ccf8b73ee2268354336da56d91e492/tokenizers/src/models/unigram/model.rs#L119 + this.fuse_unk = true; + } + + /** + * Populates lattice nodes. + * @param {TokenLattice} lattice The token lattice to populate with nodes. 
+ */ + populateNodes(lattice) { + const sentence = lattice.sentence; + const len = sentence.length; + let beginPos = 0; + while (beginPos < len) { + const mblen = 1; + let hasSingleNode = false; + const tokens = []; + + for (let token of this.trie.commonPrefixSearch( + sentence.slice(beginPos), + )) { + tokens.push(token); + const tokenId = this.tokens_to_ids.get(token); + const tokenScore = this.scores[tokenId]; + const n = token.length; + lattice.insert(beginPos, n, tokenScore, tokenId); + if (!hasSingleNode && n === mblen) { + hasSingleNode = true; } - - this.unk_token_id = config.unk_id; - this.unk_token = this.vocab[config.unk_id]; - - this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i])); - this.bosToken = ' '; // beginning of a sentence token - - this.bosTokenId = this.tokens_to_ids.get(this.bosToken); // NOTE: may be undefined - this.eosToken = moreConfig.eos_token; - - this.eosTokenId = this.tokens_to_ids.get(this.eosToken); - this.unkToken = this.vocab[this.unk_token_id]; - - this.minScore = min(this.scores)[0]; - - this.unkScore = this.minScore - 10.0; - this.scores[this.unk_token_id] = this.unkScore; - - this.trie = new CharTrie(); - this.trie.extend(this.vocab); - - // NOTE: `fuse_unk` is hardcoded to true for Unigram models - // See: https://github.com/huggingface/tokenizers/blob/b58227c7f1ccf8b73ee2268354336da56d91e492/tokenizers/src/models/unigram/model.rs#L119 - this.fuse_unk = true; + } + if (!hasSingleNode) { + lattice.insert(beginPos, mblen, this.unkScore, this.unk_token_id); + } + beginPos += mblen; } + } - /** - * Populates lattice nodes. - * @param {TokenLattice} lattice The token lattice to populate with nodes. - */ - populateNodes(lattice) { - const sentence = lattice.sentence; - const len = sentence.length; - let beginPos = 0; - while (beginPos < len) { - const mblen = 1; - let hasSingleNode = false; - const tokens = []; + /** + * Encodes an array of tokens into an array of subtokens using the unigram model. 
+ * + * @param {string} normalized The normalized string. + * @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model. + */ + tokenize(normalized) { + const lattice = new TokenLattice( + normalized, + this.bosTokenId, + this.eosTokenId, + ); + this.populateNodes(lattice); + return lattice.tokens(); + } - for (let token of this.trie.commonPrefixSearch(sentence.slice(beginPos))) { - tokens.push(token); - const tokenId = this.tokens_to_ids.get(token); - const tokenScore = this.scores[tokenId]; - const n = token.length; - lattice.insert(beginPos, n, tokenScore, tokenId); - if (!hasSingleNode && n === mblen) { - hasSingleNode = true; - } - } - if (!hasSingleNode) { - lattice.insert(beginPos, mblen, this.unkScore, this.unk_token_id); - } - beginPos += mblen; - } + /** + * Encodes an array of tokens using Unigram encoding. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} An array of encoded tokens. + */ + encode(tokens) { + const toReturn = []; + for (const token of tokens) { + const tokenized = this.tokenize(token); + toReturn.push(...tokenized); } - - /** - * Encodes an array of tokens into an array of subtokens using the unigram model. - * - * @param {string} normalized The normalized string. - * @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model. - */ - tokenize(normalized) { - const lattice = new TokenLattice(normalized, this.bosTokenId, this.eosTokenId); - this.populateNodes(lattice); - return lattice.tokens(); - } - - /** - * Encodes an array of tokens using Unigram encoding. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} An array of encoded tokens. 
- */ - encode(tokens) { - const toReturn = []; - for (const token of tokens) { - const tokenized = this.tokenize(token); - toReturn.push(...tokenized); - } - return toReturn; - } - + return toReturn; + } } /** @@ -573,31 +583,39 @@ class Unigram extends TokenizerModel { * @returns {Object} Object with utf-8 byte keys and unicode string values. */ const BYTES_TO_UNICODE = (() => { - // Returns list of utf-8 byte and a mapping to unicode strings. - // We specifically avoids mapping to whitespace/control characters - // the bpe code barfs on. + // Returns list of utf-8 byte and a mapping to unicode strings. + // We specifically avoids mapping to whitespace/control characters + // the bpe code barfs on. - const bs = [ - ...Array.from({ length: "~".charCodeAt(0) - "!".charCodeAt(0) + 1 }, (_, i) => i + "!".charCodeAt(0)), - ...Array.from({ length: "ยฌ".charCodeAt(0) - "ยก".charCodeAt(0) + 1 }, (_, i) => i + "ยก".charCodeAt(0)), - ...Array.from({ length: "รฟ".charCodeAt(0) - "ยฎ".charCodeAt(0) + 1 }, (_, i) => i + "ยฎ".charCodeAt(0)), - ]; - const cs = bs.slice(); - let n = 0; - for (let b = 0; b < 256; ++b) { - if (!bs.includes(b)) { - bs.push(b); - cs.push(256 + n); - n += 1; - } + const bs = [ + ...Array.from( + { length: "~".charCodeAt(0) - "!".charCodeAt(0) + 1 }, + (_, i) => i + "!".charCodeAt(0), + ), + ...Array.from( + { length: "ยฌ".charCodeAt(0) - "ยก".charCodeAt(0) + 1 }, + (_, i) => i + "ยก".charCodeAt(0), + ), + ...Array.from( + { length: "รฟ".charCodeAt(0) - "ยฎ".charCodeAt(0) + 1 }, + (_, i) => i + "ยฎ".charCodeAt(0), + ), + ]; + const cs = bs.slice(); + let n = 0; + for (let b = 0; b < 256; ++b) { + if (!bs.includes(b)) { + bs.push(b); + cs.push(256 + n); + n += 1; } - const ccs = cs.map(n => String.fromCharCode(n)); - return Object.fromEntries(bs.map((b, i) => [b, ccs[i]])); + } + const ccs = cs.map((n) => String.fromCharCode(n)); + return Object.fromEntries(bs.map((b, i) => [b, ccs[i]])); })(); const UNICODE_TO_BYTES = 
reverseDictionary(BYTES_TO_UNICODE); - /** * @typedef {Object} BPENode * @property {string} token The token associated with the node @@ -612,343 +630,345 @@ const UNICODE_TO_BYTES = reverseDictionary(BYTES_TO_UNICODE); * @extends TokenizerModel */ class BPE extends TokenizerModel { - /** - * Create a BPE instance. - * @param {Object} config The configuration object for BPE. - * @param {Object} config.vocab A mapping of tokens to ids. - * @param {string} config.unk_token The unknown token used for out of vocabulary words. - * @param {string} config.end_of_word_suffix The suffix to place at the end of each word. - * @param {string} [config.continuing_subword_suffix] The suffix to insert between words. - * @param {Array} config.merges An array of BPE merges as strings. - */ - constructor(config) { - super(config); + /** + * Create a BPE instance. + * @param {Object} config The configuration object for BPE. + * @param {Object} config.vocab A mapping of tokens to ids. + * @param {string} config.unk_token The unknown token used for out of vocabulary words. + * @param {string} config.end_of_word_suffix The suffix to place at the end of each word. + * @param {string} [config.continuing_subword_suffix] The suffix to insert between words. + * @param {Array} config.merges An array of BPE merges as strings. 
+ */ + constructor(config) { + super(config); - this.BPE_SPLIT_TOKEN = ' '; + this.BPE_SPLIT_TOKEN = " "; - /** @type {Map} */ - this.tokens_to_ids = objectToMap(config.vocab); + /** @type {Map} */ + this.tokens_to_ids = objectToMap(config.vocab); - this.unk_token_id = this.tokens_to_ids.get(config.unk_token); - this.unk_token = config.unk_token; + this.unk_token_id = this.tokens_to_ids.get(config.unk_token); + this.unk_token = config.unk_token; - this.vocab = new Array(this.tokens_to_ids.size); - for (const [key, value] of this.tokens_to_ids) { - this.vocab[value] = key; - } - - this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i])); - this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN)); - - this.end_of_word_suffix = config.end_of_word_suffix; - - // NOTE: `continuing_subword_suffix` is custom (to support `BlenderbotSmallTokenizer`) - this.continuing_subword_suffix = config.continuing_subword_suffix ?? null; - - this.byte_fallback = this.config.byte_fallback ?? false; - - if (this.byte_fallback) { - this.text_encoder = new TextEncoder(); - } - - /** @type {Map} */ - this.cache = new Map(); + this.vocab = new Array(this.tokens_to_ids.size); + for (const [key, value] of this.tokens_to_ids) { + this.vocab[value] = key; } - /** - * Apply Byte-Pair-Encoding (BPE) to a given token. Efficient heap-based priority - * queue implementation adapted from https://github.com/belladoreai/llama-tokenizer-js. - * @param {string} token The token to encode. - * @returns {string[]} The BPE encoded tokens. - */ - bpe(token) { - if (token.length === 0) { - return []; + this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i])); + this.merges = config.merges.map((x) => x.split(this.BPE_SPLIT_TOKEN)); + + this.end_of_word_suffix = config.end_of_word_suffix; + + // NOTE: `continuing_subword_suffix` is custom (to support `BlenderbotSmallTokenizer`) + this.continuing_subword_suffix = config.continuing_subword_suffix ?? 
null; + + this.byte_fallback = this.config.byte_fallback ?? false; + + if (this.byte_fallback) { + this.text_encoder = new TextEncoder(); + } + + /** @type {Map} */ + this.cache = new Map(); + } + + /** + * Apply Byte-Pair-Encoding (BPE) to a given token. Efficient heap-based priority + * queue implementation adapted from https://github.com/belladoreai/llama-tokenizer-js. + * @param {string} token The token to encode. + * @returns {string[]} The BPE encoded tokens. + */ + bpe(token) { + if (token.length === 0) { + return []; + } + + const cached = this.cache.get(token); + if (cached !== undefined) { + return cached; + } + + const word = Array.from(token); + if (this.end_of_word_suffix) { + word[word.length - 1] += this.end_of_word_suffix; + } + + let result = []; + if (word.length > 1) { + // Create a priority queue to store the nodes that will be merged. + // The comparator function compares the scores of the nodes. + const queue = new PriorityQueue((a, b) => a.score < b.score); + + // Construct a doubly-linked list of nodes that will be inserted into the priority queue, + // starting with the individual characters. We also populate each node with a positional + // bias to break ties in the priority queue. + let startingNode = { + token: word[0], + bias: 0, + prev: null, + next: null, + }; + + let previousNode = startingNode; + for (let i = 1; i < word.length; ++i) { + const currentNode = { + bias: i / word.length, // Add fractional component to break ties + token: word[i], + prev: previousNode, + next: null, + }; + previousNode.next = currentNode; + this._add_node(queue, previousNode); + previousNode = currentNode; + } + + while (!queue.isEmpty()) { + // Get the next node with the highest priority + const node = queue.pop(); + + // Check that this merge is still possible + if (node.deleted || !node.next || node.next.deleted) continue; + + // Here, we mark the current node (left side of the merge) and the next node (right side of the merge) as deleted. 
+ // This is because they will both be replaced by a new node representing the merge result. + node.deleted = true; + node.next.deleted = true; + + // Next, we fix the node that comes before the current node (i.e., left side of the merge). + if (node.prev) { + // Make a shallow copy of the previous node + const newPreviousNode = { ...node.prev }; + + // Mark the old previous node as deleted. This avoids erroneous merges later, + // because there may still be references to this node in the priority queue. + node.prev.deleted = true; + node.prev = newPreviousNode; + + // Update the reference of the previous node, by pointing its previous node to this new previous node. + if (newPreviousNode.prev) { + newPreviousNode.prev.next = newPreviousNode; + } else { + // If the previous of the previous node does not exist, it means that + // `newPreviousNode` must be the new `startingNode`. + startingNode = newPreviousNode; + } } - const cached = this.cache.get(token); - if (cached !== undefined) { - return cached; - } + // Create a new node which represents the result of the merge. + const merged = { + token: node.token + node.next.token, + bias: node.bias, + prev: node.prev, + next: node.next.next, + }; - const word = Array.from(token); - if (this.end_of_word_suffix) { - word[word.length - 1] += this.end_of_word_suffix; - } - - let result = []; - if (word.length > 1) { - // Create a priority queue to store the nodes that will be merged. - // The comparator function compares the scores of the nodes. - const queue = new PriorityQueue((a, b) => a.score < b.score); - - // Construct a doubly-linked list of nodes that will be inserted into the priority queue, - // starting with the individual characters. We also populate each node with a positional - // bias to break ties in the priority queue. 
- let startingNode = { - token: word[0], - bias: 0, - prev: null, - next: null, - } - - let previousNode = startingNode - for (let i = 1; i < word.length; ++i) { - const currentNode = { - bias: i / word.length, // Add fractional component to break ties - token: word[i], - prev: previousNode, - next: null, - } - previousNode.next = currentNode - this._add_node(queue, previousNode) - previousNode = currentNode - } - - while (!queue.isEmpty()) { - // Get the next node with the highest priority - const node = queue.pop(); - - // Check that this merge is still possible - if (node.deleted || !node.next || node.next.deleted) continue; - - // Here, we mark the current node (left side of the merge) and the next node (right side of the merge) as deleted. - // This is because they will both be replaced by a new node representing the merge result. - node.deleted = true; - node.next.deleted = true; - - // Next, we fix the node that comes before the current node (i.e., left side of the merge). - if (node.prev) { - - // Make a shallow copy of the previous node - const newPreviousNode = { ...node.prev }; - - // Mark the old previous node as deleted. This avoids erroneous merges later, - // because there may still be references to this node in the priority queue. - node.prev.deleted = true; - node.prev = newPreviousNode; - - // Update the reference of the previous node, by pointing its previous node to this new previous node. - if (newPreviousNode.prev) { - newPreviousNode.prev.next = newPreviousNode; - } else { - // If the previous of the previous node does not exist, it means that - // `newPreviousNode` must be the new `startingNode`. - startingNode = newPreviousNode; - } - } - - // Create a new node which represents the result of the merge. - const merged = { - token: node.token + node.next.token, - bias: node.bias, - prev: node.prev, - next: node.next.next, - } - - // We now consider where we can add the new merged node to the priority queue: - // 1. 
prev <-> merged - if (merged.prev) { - merged.prev.next = merged; - this._add_node(queue, merged.prev); - } else { - // If `merged.prev` does not exist, then `merged` must be the new `startingNode`. - startingNode = merged; - } - - // 2. merged <-> next - if (merged.next) { - merged.next.prev = merged; - this._add_node(queue, merged); - } - } - - // Traverse the linked list, starting from the `startingNode`, and collect the tokens. - for (let currentNode = startingNode; currentNode !== null; currentNode = currentNode.next) { - result.push(currentNode.token); - } + // We now consider where we can add the new merged node to the priority queue: + // 1. prev <-> merged + if (merged.prev) { + merged.prev.next = merged; + this._add_node(queue, merged.prev); } else { - result = word; + // If `merged.prev` does not exist, then `merged` must be the new `startingNode`. + startingNode = merged; } - // Possibly append suffix - if (this.continuing_subword_suffix) { - // Do not append suffix to the last token - for (let i = 0; i < result.length - 1; ++i) { - result[i] += this.continuing_subword_suffix; - } + // 2. merged <-> next + if (merged.next) { + merged.next.prev = merged; + this._add_node(queue, merged); } + } - // Save the result to the cache - this.cache.set(token, result); - - return result; + // Traverse the linked list, starting from the `startingNode`, and collect the tokens. + for ( + let currentNode = startingNode; + currentNode !== null; + currentNode = currentNode.next + ) { + result.push(currentNode.token); + } + } else { + result = word; } - - /** - * Helper function to add a node to the priority queue. 
- * @param {PriorityQueue} queue - * @param {BPENode} node - * @private - */ - _add_node(queue, node) { - // `score` is a measure of the merge priority: lower means higher priority - // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list) - // We also add a fractional component to the score to break ties (with the earlier character having higher priority) - const rank = this.bpe_ranks.get(node.token + this.BPE_SPLIT_TOKEN + node.next.token); - if (rank !== undefined) { - node.score = rank + node.bias; - queue.push(node); - } + // Possibly append suffix + if (this.continuing_subword_suffix) { + // Do not append suffix to the last token + for (let i = 0; i < result.length - 1; ++i) { + result[i] += this.continuing_subword_suffix; + } } - /** - * Encodes the input sequence of tokens using the BPE algorithm and returns the resulting subword tokens. - * @param {string[]} tokens The input sequence of tokens to encode. - * @returns {string[]} The resulting subword tokens after applying the BPE algorithm to the input sequence of tokens. - */ - encode(tokens) { - const outputTokens = []; + // Save the result to the cache + this.cache.set(token, result); - for (const token of tokens) { - const bpe_token_list = this.bpe(token); + return result; + } - for (const t of bpe_token_list) { - if (this.tokens_to_ids.has(t)) { - outputTokens.push(t); - } else { - if (this.byte_fallback) { - outputTokens.push( - ...Array.from(this.text_encoder.encode(t)) - .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`) - ); - } else { - outputTokens.push(this.unk_token); - } - } - } + /** + * Helper function to add a node to the priority queue. 
+ * @param {PriorityQueue} queue + * @param {BPENode} node + * @private + */ + _add_node(queue, node) { + // `score` is a measure of the merge priority: lower means higher priority + // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list) + // We also add a fractional component to the score to break ties (with the earlier character having higher priority) + const rank = this.bpe_ranks.get( + node.token + this.BPE_SPLIT_TOKEN + node.next.token, + ); + if (rank !== undefined) { + node.score = rank + node.bias; + queue.push(node); + } + } + + /** + * Encodes the input sequence of tokens using the BPE algorithm and returns the resulting subword tokens. + * @param {string[]} tokens The input sequence of tokens to encode. + * @returns {string[]} The resulting subword tokens after applying the BPE algorithm to the input sequence of tokens. + */ + encode(tokens) { + const outputTokens = []; + + for (const token of tokens) { + const bpe_token_list = this.bpe(token); + + for (const t of bpe_token_list) { + if (this.tokens_to_ids.has(t)) { + outputTokens.push(t); + } else { + if (this.byte_fallback) { + outputTokens.push( + ...Array.from(this.text_encoder.encode(t)).map( + (x) => `<0x${x.toString(16).toUpperCase().padStart(2, "0")}>`, + ), + ); + } else { + outputTokens.push(this.unk_token); + } } - - return outputTokens; + } } + return outputTokens; + } } /** * Legacy tokenizer class for tokenizers with only a vocabulary. */ class LegacyTokenizerModel extends TokenizerModel { - /** - * Create a LegacyTokenizerModel instance. - * @param {Object} config The configuration object for LegacyTokenizerModel. - * @param {Object} config.vocab A (possibly nested) mapping of tokens to ids. - * @param {Object} moreConfig Additional configuration object for the LegacyTokenizerModel model. - */ - constructor(config, moreConfig) { - super(config); + /** + * Create a LegacyTokenizerModel instance. 
+ * @param {Object} config The configuration object for LegacyTokenizerModel. + * @param {Object} config.vocab A (possibly nested) mapping of tokens to ids. + * @param {Object} moreConfig Additional configuration object for the LegacyTokenizerModel model. + */ + constructor(config, moreConfig) { + super(config); - /**@type {Map} */ - this.tokens_to_ids = objectToMap( - moreConfig.target_lang - ? config.vocab[moreConfig.target_lang] - : config.vocab - ); + /**@type {Map} */ + this.tokens_to_ids = objectToMap( + moreConfig.target_lang + ? config.vocab[moreConfig.target_lang] + : config.vocab, + ); - this.bos_token = moreConfig.bos_token; - this.bos_token_id = this.tokens_to_ids.get(this.bos_token); + this.bos_token = moreConfig.bos_token; + this.bos_token_id = this.tokens_to_ids.get(this.bos_token); - this.eos_token = moreConfig.eos_token; - this.eos_token_id = this.tokens_to_ids.get(this.eos_token); + this.eos_token = moreConfig.eos_token; + this.eos_token_id = this.tokens_to_ids.get(this.eos_token); - this.pad_token = moreConfig.pad_token; - this.pad_token_id = this.tokens_to_ids.get(this.pad_token); + this.pad_token = moreConfig.pad_token; + this.pad_token_id = this.tokens_to_ids.get(this.pad_token); - this.unk_token = moreConfig.unk_token; - this.unk_token_id = this.tokens_to_ids.get(this.unk_token); + this.unk_token = moreConfig.unk_token; + this.unk_token_id = this.tokens_to_ids.get(this.unk_token); - this.vocab = new Array(this.tokens_to_ids.size); - for (const [key, value] of this.tokens_to_ids) { - this.vocab[value] = key; - } + this.vocab = new Array(this.tokens_to_ids.size); + for (const [key, value] of this.tokens_to_ids) { + this.vocab[value] = key; } + } - encode(tokens) { - return tokens; - } + encode(tokens) { + return tokens; + } } - /** * A base class for text normalization. * @abstract */ class Normalizer extends Callable { - /** - * @param {Object} config The configuration object for the normalizer. 
- */ - constructor(config) { - super(); - this.config = config; - } + /** + * @param {Object} config The configuration object for the normalizer. + */ + constructor(config) { + super(); + this.config = config; + } - /** - * Factory method for creating normalizers from config objects. - * @static - * @param {Object} config The configuration object for the normalizer. - * @returns {Normalizer} A Normalizer object. - * @throws {Error} If an unknown Normalizer type is specified in the config. - */ - static fromConfig(config) { - if (config === null) return null; - switch (config.type) { - case 'BertNormalizer': - return new BertNormalizer(config); - case 'Precompiled': - return new Precompiled(config); - case 'Sequence': - return new NormalizerSequence(config); - case 'Replace': - return new Replace(config); - case 'NFC': - return new NFC(config); - case 'NFKC': - return new NFKC(config); - case 'NFKD': - return new NFKD(config); - case 'Strip': - return new StripNormalizer(config); - case 'StripAccents': - return new StripAccents(config); - case 'Lowercase': - return new Lowercase(config); - case 'Prepend': - return new Prepend(config); - default: - throw new Error(`Unknown Normalizer type: ${config.type}`); - } + /** + * Factory method for creating normalizers from config objects. + * @static + * @param {Object} config The configuration object for the normalizer. + * @returns {Normalizer} A Normalizer object. + * @throws {Error} If an unknown Normalizer type is specified in the config. 
+ */ + static fromConfig(config) { + if (config === null) return null; + switch (config.type) { + case "BertNormalizer": + return new BertNormalizer(config); + case "Precompiled": + return new Precompiled(config); + case "Sequence": + return new NormalizerSequence(config); + case "Replace": + return new Replace(config); + case "NFC": + return new NFC(config); + case "NFKC": + return new NFKC(config); + case "NFKD": + return new NFKD(config); + case "Strip": + return new StripNormalizer(config); + case "StripAccents": + return new StripAccents(config); + case "Lowercase": + return new Lowercase(config); + case "Prepend": + return new Prepend(config); + default: + throw new Error(`Unknown Normalizer type: ${config.type}`); } + } - /** - * Normalize the input text. - * @abstract - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - * @throws {Error} If this method is not implemented in a subclass. - */ - normalize(text) { - throw Error("normalize should be implemented in subclass.") - } - - /** - * Alias for {@link Normalizer#normalize}. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - */ - _call(text) { - return this.normalize(text); - } + /** + * Normalize the input text. + * @abstract + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + * @throws {Error} If this method is not implemented in a subclass. + */ + normalize(text) { + throw Error("normalize should be implemented in subclass."); + } + /** + * Alias for {@link Normalizer#normalize}. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + */ + _call(text) { + return this.normalize(text); + } } /** @@ -956,17 +976,17 @@ class Normalizer extends Callable { * @extends Normalizer */ class Replace extends Normalizer { - /** - * Normalize the input text by replacing the pattern with the content. - * @param {string} text The input text to be normalized. 
- * @returns {string} The normalized text after replacing the pattern with the content. - */ - normalize(text) { - const pattern = createPattern(this.config.pattern); - return pattern === null - ? text - : text.replaceAll(pattern, this.config.content); - } + /** + * Normalize the input text by replacing the pattern with the content. + * @param {string} text The input text to be normalized. + * @returns {string} The normalized text after replacing the pattern with the content. + */ + normalize(text) { + const pattern = createPattern(this.config.pattern); + return pattern === null + ? text + : text.replaceAll(pattern, this.config.content); + } } /** @@ -974,15 +994,15 @@ class Replace extends Normalizer { * @extends Normalizer */ class NFC extends Normalizer { - /** - * Normalize the input text by applying Unicode normalization form C (NFC). - * @param {string} text The input text to be normalized. - * @returns {string} The normalized text. - */ - normalize(text) { - text = text.normalize('NFC') - return text; - } + /** + * Normalize the input text by applying Unicode normalization form C (NFC). + * @param {string} text The input text to be normalized. + * @returns {string} The normalized text. + */ + normalize(text) { + text = text.normalize("NFC"); + return text; + } } /** @@ -990,55 +1010,55 @@ class NFC extends Normalizer { * @extends Normalizer */ class NFKC extends Normalizer { - /** - * Normalize text using NFKC normalization. - * @param {string} text The text to be normalized. - * @returns {string} The normalized text. - */ - normalize(text) { - text = text.normalize('NFKC') - return text; - } + /** + * Normalize text using NFKC normalization. + * @param {string} text The text to be normalized. + * @returns {string} The normalized text. + */ + normalize(text) { + text = text.normalize("NFKC"); + return text; + } } /** * NFKD Normalizer. * @extends Normalizer */ class NFKD extends Normalizer { - /** - * Normalize text using NFKD normalization. 
- * @param {string} text The text to be normalized. - * @returns {string} The normalized text. - */ - normalize(text) { - text = text.normalize('NFKD') - return text; - } + /** + * Normalize text using NFKD normalization. + * @param {string} text The text to be normalized. + * @returns {string} The normalized text. + */ + normalize(text) { + text = text.normalize("NFKD"); + return text; + } } /** * A normalizer that strips leading and/or trailing whitespace from the input text. */ class StripNormalizer extends Normalizer { - /** - * Strip leading and/or trailing whitespace from the input text. - * @param {string} text The input text. - * @returns {string} The normalized text. - */ - normalize(text) { - if (this.config.strip_left && this.config.strip_right) { - // Fast path to avoid an extra trim call - text = text.trim(); - } else { - if (this.config.strip_left) { - text = text.trimStart(); - } - if (this.config.strip_right) { - text = text.trimEnd(); - } - } - return text; + /** + * Strip leading and/or trailing whitespace from the input text. + * @param {string} text The input text. + * @returns {string} The normalized text. + */ + normalize(text) { + if (this.config.strip_left && this.config.strip_right) { + // Fast path to avoid an extra trim call + text = text.trim(); + } else { + if (this.config.strip_left) { + text = text.trimStart(); + } + if (this.config.strip_right) { + text = text.trimEnd(); + } } + return text; + } } /** @@ -1046,15 +1066,15 @@ class StripNormalizer extends Normalizer { * @extends Normalizer */ class StripAccents extends Normalizer { - /** - * Remove all accents from the text. - * @param {string} text The input text. - * @returns {string} The normalized text without accents. - */ - normalize(text) { - text = remove_accents(text); - return text; - } + /** + * Remove all accents from the text. + * @param {string} text The input text. + * @returns {string} The normalized text without accents. 
+ */ + normalize(text) { + text = remove_accents(text); + return text; + } } /** @@ -1062,15 +1082,15 @@ class StripAccents extends Normalizer { * @extends Normalizer */ class Lowercase extends Normalizer { - /** - * Lowercases the input string. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - */ - normalize(text) { - text = text.toLowerCase(); - return text; - } + /** + * Lowercases the input string. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + */ + normalize(text) { + text = text.toLowerCase(); + return text; + } } /** @@ -1078,15 +1098,15 @@ class Lowercase extends Normalizer { * @extends Normalizer */ class Prepend extends Normalizer { - /** - * Prepends the input string. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - */ - normalize(text) { - text = this.config.prepend + text; - return text; - } + /** + * Prepends the input string. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + */ + normalize(text) { + text = this.config.prepend + text; + return text; + } } /** @@ -1094,25 +1114,25 @@ class Prepend extends Normalizer { * @extends Normalizer */ class NormalizerSequence extends Normalizer { - /** + /** * Create a new instance of NormalizerSequence. * @param {Object} config The configuration object. * @param {Object[]} config.normalizers An array of Normalizer configuration objects. */ - constructor(config) { - super(config); - this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x)); - } - /** - * Apply a sequence of Normalizers to the input text. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. 
- */ - normalize(text) { - return this.normalizers.reduce((t, normalizer) => { - return normalizer.normalize(t); - }, text); - } + constructor(config) { + super(config); + this.normalizers = config.normalizers.map((x) => Normalizer.fromConfig(x)); + } + /** + * Apply a sequence of Normalizers to the input text. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + */ + normalize(text) { + return this.normalizers.reduce((t, normalizer) => { + return normalizer.normalize(t); + }, text); + } } /** @@ -1120,136 +1140,136 @@ class NormalizerSequence extends Normalizer { * @extends Normalizer */ class BertNormalizer extends Normalizer { - /** - * Adds whitespace around any CJK (Chinese, Japanese, or Korean) character in the input text. - * - * @param {string} text The input text to tokenize. - * @returns {string} The tokenized text with whitespace added around CJK characters. - */ - _tokenize_chinese_chars(text) { - /* Adds whitespace around any CJK character. */ - const output = []; - for (let i = 0; i < text.length; ++i) { - const char = text[i]; - const cp = char.charCodeAt(0); - if (this._is_chinese_char(cp)) { - output.push(" "); - output.push(char); - output.push(" "); - } else { - output.push(char); - } - } - return output.join(""); + /** + * Adds whitespace around any CJK (Chinese, Japanese, or Korean) character in the input text. + * + * @param {string} text The input text to tokenize. + * @returns {string} The tokenized text with whitespace added around CJK characters. + */ + _tokenize_chinese_chars(text) { + /* Adds whitespace around any CJK character. 
*/ + const output = []; + for (let i = 0; i < text.length; ++i) { + const char = text[i]; + const cp = char.charCodeAt(0); + if (this._is_chinese_char(cp)) { + output.push(" "); + output.push(char); + output.push(" "); + } else { + output.push(char); + } + } + return output.join(""); + } + + /** + * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character. + * + * A "chinese character" is defined as anything in the CJK Unicode block: + * https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + * + * Note that the CJK Unicode block is NOT all Japanese and Korean characters, despite its name. + * The modern Korean Hangul alphabet is a different block, as is Japanese Hiragana and Katakana. + * Those alphabets are used to write space-separated words, so they are not treated specially + * and are handled like all other languages. + * + * @param {number} cp The Unicode codepoint to check. + * @returns {boolean} True if the codepoint represents a CJK character, false otherwise. + */ + _is_chinese_char(cp) { + return ( + (cp >= 0x4e00 && cp <= 0x9fff) || + (cp >= 0x3400 && cp <= 0x4dbf) || + (cp >= 0x20000 && cp <= 0x2a6df) || + (cp >= 0x2a700 && cp <= 0x2b73f) || + (cp >= 0x2b740 && cp <= 0x2b81f) || + (cp >= 0x2b820 && cp <= 0x2ceaf) || + (cp >= 0xf900 && cp <= 0xfaff) || + (cp >= 0x2f800 && cp <= 0x2fa1f) + ); + } + /** + * Strips accents from the given text. + * @param {string} text The text to strip accents from. + * @returns {string} The text with accents removed. + */ + stripAccents(text) { + return text.normalize("NFD").replace(/[\u0300-\u036f]/g, ""); + } + + /** + * Checks whether `char` is a control character. + * @param {string} char The character to check. + * @returns {boolean} Whether `char` is a control character. + * @private + */ + _is_control(char) { + switch (char) { + case "\t": + case "\n": + case "\r": + // These are technically control characters but we count them as whitespace characters. 
+ return false; + + default: + // Check if unicode category starts with C: + // Cc - Control + // Cf - Format + // Co - Private Use + // Cs - Surrogate + return /^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(char); + } + } + + /** + * Performs invalid character removal and whitespace cleanup on text. + * @param {string} text The text to clean. + * @returns {string} The cleaned text. + * @private + */ + _clean_text(text) { + const output = []; + for (const char of text) { + const cp = char.charCodeAt(0); + if (cp === 0 || cp === 0xfffd || this._is_control(char)) { + continue; + } + if (/^\s$/.test(char)) { + // is whitespace + output.push(" "); + } else { + output.push(char); + } + } + return output.join(""); + } + /** + * Normalizes the given text based on the configuration. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + */ + normalize(text) { + if (this.config.clean_text) { + text = this._clean_text(text); } - /** - * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character. - * - * A "chinese character" is defined as anything in the CJK Unicode block: - * https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - * - * Note that the CJK Unicode block is NOT all Japanese and Korean characters, despite its name. - * The modern Korean Hangul alphabet is a different block, as is Japanese Hiragana and Katakana. - * Those alphabets are used to write space-separated words, so they are not treated specially - * and are handled like all other languages. - * - * @param {number} cp The Unicode codepoint to check. - * @returns {boolean} True if the codepoint represents a CJK character, false otherwise. 
- */ - _is_chinese_char(cp) { - return ( - (cp >= 0x4E00 && cp <= 0x9FFF) - || (cp >= 0x3400 && cp <= 0x4DBF) - || (cp >= 0x20000 && cp <= 0x2A6DF) - || (cp >= 0x2A700 && cp <= 0x2B73F) - || (cp >= 0x2B740 && cp <= 0x2B81F) - || (cp >= 0x2B820 && cp <= 0x2CEAF) - || (cp >= 0xF900 && cp <= 0xFAFF) - || (cp >= 0x2F800 && cp <= 0x2FA1F) - ) - } - /** - * Strips accents from the given text. - * @param {string} text The text to strip accents from. - * @returns {string} The text with accents removed. - */ - stripAccents(text) { - return text.normalize('NFD').replace(/[\u0300-\u036f]/g, ''); + if (this.config.handle_chinese_chars) { + text = this._tokenize_chinese_chars(text); } + if (this.config.lowercase) { + text = text.toLowerCase(); - /** - * Checks whether `char` is a control character. - * @param {string} char The character to check. - * @returns {boolean} Whether `char` is a control character. - * @private - */ - _is_control(char) { - switch (char) { - case '\t': - case '\n': - case '\r': - // These are technically control characters but we count them as whitespace characters. - return false; - - default: - // Check if unicode category starts with C: - // Cc - Control - // Cf - Format - // Co - Private Use - // Cs - Surrogate - return /^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(char); - } + if (this.config.strip_accents !== false) { + text = this.stripAccents(text); + } + } else if (this.config.strip_accents) { + text = this.stripAccents(text); } - /** - * Performs invalid character removal and whitespace cleanup on text. - * @param {string} text The text to clean. - * @returns {string} The cleaned text. 
- * @private - */ - _clean_text(text) { - const output = []; - for (const char of text) { - const cp = char.charCodeAt(0); - if (cp === 0 || cp === 0xFFFD || this._is_control(char)) { - continue; - } - if (/^\s$/.test(char)) { // is whitespace - output.push(" "); - } else { - output.push(char); - } - } - return output.join(""); - } - /** - * Normalizes the given text based on the configuration. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - */ - normalize(text) { - if (this.config.clean_text) { - text = this._clean_text(text); - } - - if (this.config.handle_chinese_chars) { - text = this._tokenize_chinese_chars(text); - } - - if (this.config.lowercase) { - text = text.toLowerCase(); - - if (this.config.strip_accents !== false) { - text = this.stripAccents(text); - } - } else if (this.config.strip_accents) { - text = this.stripAccents(text); - } - - return text; - } + return text; + } } /** @@ -1258,7 +1278,7 @@ class BertNormalizer extends Normalizer { * @extends Callable */ class PreTokenizer extends Callable { - /** + /** * Factory method that returns an instance of a subclass of `PreTokenizer` based on the provided configuration. * * @static @@ -1266,98 +1286,102 @@ class PreTokenizer extends Callable { * @returns {PreTokenizer} An instance of a subclass of `PreTokenizer`. * @throws {Error} If the provided configuration object does not correspond to any known pre-tokenizer. 
*/ - static fromConfig(config) { - if (config === null) return null; + static fromConfig(config) { + if (config === null) return null; - switch (config.type) { - case 'BertPreTokenizer': - return new BertPreTokenizer(config); - case 'Sequence': - return new PreTokenizerSequence(config); - case 'WhitespaceSplit': - return new WhitespaceSplit(config); - case 'Metaspace': - return new MetaspacePreTokenizer(config); + switch (config.type) { + case "BertPreTokenizer": + return new BertPreTokenizer(config); + case "Sequence": + return new PreTokenizerSequence(config); + case "WhitespaceSplit": + return new WhitespaceSplit(config); + case "Metaspace": + return new MetaspacePreTokenizer(config); - case 'ByteLevel': - return new ByteLevelPreTokenizer(config); - case 'Split': - return new SplitPreTokenizer(config); - case 'Punctuation': - return new PunctuationPreTokenizer(config); - case 'Digits': - return new DigitsPreTokenizer(config); - case 'Replace': - return new ReplacePreTokenizer(config); - default: - throw new Error(`Unknown PreTokenizer type: ${config.type}`); - } + case "ByteLevel": + return new ByteLevelPreTokenizer(config); + case "Split": + return new SplitPreTokenizer(config); + case "Punctuation": + return new PunctuationPreTokenizer(config); + case "Digits": + return new DigitsPreTokenizer(config); + case "Replace": + return new ReplacePreTokenizer(config); + default: + throw new Error(`Unknown PreTokenizer type: ${config.type}`); } + } - /** - * Method that should be implemented by subclasses to define the specific pre-tokenization logic. - * - * @abstract - * @param {string} text The text to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} The pre-tokenized text. - * @throws {Error} If the method is not implemented in the subclass. 
- */ - pre_tokenize_text(text, options) { - throw Error("pre_tokenize_text should be implemented in subclass.") - } + /** + * Method that should be implemented by subclasses to define the specific pre-tokenization logic. + * + * @abstract + * @param {string} text The text to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} The pre-tokenized text. + * @throws {Error} If the method is not implemented in the subclass. + */ + pre_tokenize_text(text, options) { + throw Error("pre_tokenize_text should be implemented in subclass."); + } - /** - * Tokenizes the given text into pre-tokens. - * @param {string|string[]} text The text or array of texts to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of pre-tokens. - */ - pre_tokenize(text, options) { - return (Array.isArray(text) - ? text.map(x => this.pre_tokenize_text(x, options)) - : this.pre_tokenize_text(text, options) - ).flat(); - } + /** + * Tokenizes the given text into pre-tokens. + * @param {string|string[]} text The text or array of texts to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of pre-tokens. + */ + pre_tokenize(text, options) { + return ( + Array.isArray(text) + ? text.map((x) => this.pre_tokenize_text(x, options)) + : this.pre_tokenize_text(text, options) + ).flat(); + } - /** - * Alias for {@link PreTokenizer#pre_tokenize}. - * @param {string|string[]} text The text or array of texts to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of pre-tokens. - */ - _call(text, options) { - return this.pre_tokenize(text, options); - } + /** + * Alias for {@link PreTokenizer#pre_tokenize}. + * @param {string|string[]} text The text or array of texts to pre-tokenize. 
+ * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of pre-tokens. + */ + _call(text, options) { + return this.pre_tokenize(text, options); + } } /** * @extends PreTokenizer */ class BertPreTokenizer extends PreTokenizer { - /** - * A PreTokenizer that splits text into wordpieces using a basic tokenization scheme - * similar to that used in the original implementation of BERT. - * - * @param {Object} config The configuration object. - */ - constructor(config) { - super(); - // Construct a pattern which matches the rust implementation: - // https://github.com/huggingface/tokenizers/blob/b4fcc9ce6e4ad5806e82826f816acfdfdc4fcc67/tokenizers/src/pre_tokenizers/bert.rs#L11 - // Equivalent to removing whitespace and splitting on punctuation (both \p{P} and other ascii characters) - this.pattern = new RegExp(`[^\\s${PUNCTUATION_REGEX}]+|[${PUNCTUATION_REGEX}]`, 'gu'); - } - /** - * Tokenizes a single text using the BERT pre-tokenization scheme. - * - * @param {string} text The text to tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens. - */ - pre_tokenize_text(text, options) { - return text.trim().match(this.pattern) || []; - } + /** + * A PreTokenizer that splits text into wordpieces using a basic tokenization scheme + * similar to that used in the original implementation of BERT. + * + * @param {Object} config The configuration object. 
+ */ + constructor(config) { + super(); + // Construct a pattern which matches the rust implementation: + // https://github.com/huggingface/tokenizers/blob/b4fcc9ce6e4ad5806e82826f816acfdfdc4fcc67/tokenizers/src/pre_tokenizers/bert.rs#L11 + // Equivalent to removing whitespace and splitting on punctuation (both \p{P} and other ascii characters) + this.pattern = new RegExp( + `[^\\s${PUNCTUATION_REGEX}]+|[${PUNCTUATION_REGEX}]`, + "gu", + ); + } + /** + * Tokenizes a single text using the BERT pre-tokenization scheme. + * + * @param {string} text The text to tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens. + */ + pre_tokenize_text(text, options) { + return text.trim().match(this.pattern) || []; + } } /** @@ -1365,58 +1389,62 @@ class BertPreTokenizer extends PreTokenizer { * @extends PreTokenizer */ class ByteLevelPreTokenizer extends PreTokenizer { - /** - * Creates a new instance of the `ByteLevelPreTokenizer` class. - * @param {Object} config The configuration object. - */ - constructor(config) { - super(); - this.config = config; - - /** - * @type {boolean} Whether to add a leading space to the first word. - * This allows to treat the leading word just as any other word. - */ - this.add_prefix_space = this.config.add_prefix_space; - - /** - * @type {boolean} Whether the post processing step should trim offsets - * to avoid including whitespaces. - * @todo Use this in the pretokenization step. - */ - this.trim_offsets = this.config.trim_offsets; - - /** - * @type {boolean} Whether to use the standard GPT2 regex for whitespace splitting. - * Set it to False if you want to use your own splitting. Defaults to true. - */ - this.use_regex = this.config.use_regex ?? 
true; - this.pattern = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; - - this.byte_encoder = BYTES_TO_UNICODE; - this.text_encoder = new TextEncoder(); - } + /** + * Creates a new instance of the `ByteLevelPreTokenizer` class. + * @param {Object} config The configuration object. + */ + constructor(config) { + super(); + this.config = config; /** - * Tokenizes a single piece of text using byte-level tokenization. - * @param {string} text The text to tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens. + * @type {boolean} Whether to add a leading space to the first word. + * This allows to treat the leading word just as any other word. */ - pre_tokenize_text(text, options) { - // Add a leading space if the option is enabled - if (this.add_prefix_space && !text.startsWith(' ')) { - text = ' ' + text; - } + this.add_prefix_space = this.config.add_prefix_space; - // Split on whitespace and punctuation - const tokens = this.use_regex ? (text.match(this.pattern) || []) : [text]; + /** + * @type {boolean} Whether the post processing step should trim offsets + * to avoid including whitespaces. + * @todo Use this in the pretokenization step. + */ + this.trim_offsets = this.config.trim_offsets; - // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) - return tokens.map( - token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('') - ); + /** + * @type {boolean} Whether to use the standard GPT2 regex for whitespace splitting. + * Set it to False if you want to use your own splitting. Defaults to true. + */ + this.use_regex = this.config.use_regex ?? 
true; + this.pattern = + /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; + + this.byte_encoder = BYTES_TO_UNICODE; + this.text_encoder = new TextEncoder(); + } + + /** + * Tokenizes a single piece of text using byte-level tokenization. + * @param {string} text The text to tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens. + */ + pre_tokenize_text(text, options) { + // Add a leading space if the option is enabled + if (this.add_prefix_space && !text.startsWith(" ")) { + text = " " + text; } + + // Split on whitespace and punctuation + const tokens = this.use_regex ? text.match(this.pattern) || [] : [text]; + + // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + return tokens.map((token) => + Array.from( + this.text_encoder.encode(token), + (byte) => this.byte_encoder[byte], + ).join(""), + ); + } } /** @@ -1428,39 +1456,39 @@ class ByteLevelPreTokenizer extends PreTokenizer { * @extends PreTokenizer */ class SplitPreTokenizer extends PreTokenizer { - /** - * @param {Object} config The configuration options for the pre-tokenizer. - * @param {Object} config.pattern The pattern used to split the text. Can be a string or a regex object. - * @param {string|undefined} config.pattern.String The string to use for splitting. Only defined if the pattern is a string. - * @param {string|undefined} config.pattern.Regex The regex to use for splitting. Only defined if the pattern is a regex. - * @param {SplitDelimiterBehavior} config.behavior The behavior to use when splitting. - * @param {boolean} config.invert Whether to split (invert=false) or match (invert=true) the pattern. - */ - constructor(config) { - super(); - this.config = config; - // TODO support all behaviours (config.behavior) + /** + * @param {Object} config The configuration options for the pre-tokenizer. 
+ * @param {Object} config.pattern The pattern used to split the text. Can be a string or a regex object. + * @param {string|undefined} config.pattern.String The string to use for splitting. Only defined if the pattern is a string. + * @param {string|undefined} config.pattern.Regex The regex to use for splitting. Only defined if the pattern is a regex. + * @param {SplitDelimiterBehavior} config.behavior The behavior to use when splitting. + * @param {boolean} config.invert Whether to split (invert=false) or match (invert=true) the pattern. + */ + constructor(config) { + super(); + this.config = config; + // TODO support all behaviours (config.behavior) - this.pattern = createPattern(this.config.pattern, this.config.invert); + this.pattern = createPattern(this.config.pattern, this.config.invert); + } + + /** + * Tokenizes text by splitting it using the given pattern. + * @param {string} text The text to tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens. + */ + pre_tokenize_text(text, options) { + if (this.pattern === null) { + return []; } - /** - * Tokenizes text by splitting it using the given pattern. - * @param {string} text The text to tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens. - */ - pre_tokenize_text(text, options) { - if (this.pattern === null) { - return []; - } - - if (this.config.invert) { - return text.match(this.pattern) || []; - } else { - return regexSplit(text, this.pattern); - } + if (this.config.invert) { + return text.match(this.pattern) || []; + } else { + return regexSplit(text, this.pattern); } + } } /** @@ -1468,55 +1496,57 @@ class SplitPreTokenizer extends PreTokenizer { * @extends PreTokenizer */ class PunctuationPreTokenizer extends PreTokenizer { - /** - * @param {Object} config The configuration options for the pre-tokenizer. 
- * @param {SplitDelimiterBehavior} config.behavior The behavior to use when splitting. - */ - constructor(config) { - super(); - this.config = config; - this.pattern = new RegExp(`[^${PUNCTUATION_REGEX}]+|[${PUNCTUATION_REGEX}]+`, 'gu'); - } + /** + * @param {Object} config The configuration options for the pre-tokenizer. + * @param {SplitDelimiterBehavior} config.behavior The behavior to use when splitting. + */ + constructor(config) { + super(); + this.config = config; + this.pattern = new RegExp( + `[^${PUNCTUATION_REGEX}]+|[${PUNCTUATION_REGEX}]+`, + "gu", + ); + } - /** - * Tokenizes text by splitting it using the given pattern. - * @param {string} text The text to tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens. - */ - pre_tokenize_text(text, options) { - return text.match(this.pattern) || []; - } + /** + * Tokenizes text by splitting it using the given pattern. + * @param {string} text The text to tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens. + */ + pre_tokenize_text(text, options) { + return text.match(this.pattern) || []; + } } - /** * Splits text based on digits. * @extends PreTokenizer */ class DigitsPreTokenizer extends PreTokenizer { - /** - * @param {Object} config The configuration options for the pre-tokenizer. - * @param {boolean} config.individual_digits Whether to split on individual digits. - */ - constructor(config) { - super(); - this.config = config; + /** + * @param {Object} config The configuration options for the pre-tokenizer. + * @param {boolean} config.individual_digits Whether to split on individual digits. + */ + constructor(config) { + super(); + this.config = config; - // Construct a pattern which matches the rust implementation: - const digit_pattern = `[^\\d]+|\\d${this.config.individual_digits ? 
'' : '+'}`; - this.pattern = new RegExp(digit_pattern, 'gu'); - } + // Construct a pattern which matches the rust implementation: + const digit_pattern = `[^\\d]+|\\d${this.config.individual_digits ? "" : "+"}`; + this.pattern = new RegExp(digit_pattern, "gu"); + } - /** - * Tokenizes text by splitting it using the given pattern. - * @param {string} text The text to tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens. - */ - pre_tokenize_text(text, options) { - return text.match(this.pattern) || []; - } + /** + * Tokenizes text by splitting it using the given pattern. + * @param {string} text The text to tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens. + */ + pre_tokenize_text(text, options) { + return text.match(this.pattern) || []; + } } /** @@ -1525,7 +1555,6 @@ class DigitsPreTokenizer extends PreTokenizer { * @property {number[]} [token_type_ids] List of token type ids produced by the post-processor. */ - /** * @typedef {Object} EncodingSingle * @property {number[]} input_ids List of token ids to be fed to a model. @@ -1533,166 +1562,169 @@ class DigitsPreTokenizer extends PreTokenizer { * @property {number[]} [token_type_ids] List of indices specifying which tokens should be attended to by the model */ - /** * @extends Callable */ class PostProcessor extends Callable { + /** + * @param {Object} config The configuration for the post-processor. + */ + constructor(config) { + super(); + this.config = config; + } - /** - * @param {Object} config The configuration for the post-processor. - */ - constructor(config) { - super(); - this.config = config; + /** + * Factory method to create a PostProcessor object from a configuration object. + * + * @param {Object} config Configuration object representing a PostProcessor. 
+ * @returns {PostProcessor} A PostProcessor object created from the given configuration. + * @throws {Error} If an unknown PostProcessor type is encountered. + */ + static fromConfig(config) { + if (config === null) return null; + switch (config.type) { + case "TemplateProcessing": + return new TemplateProcessing(config); + + case "ByteLevel": + return new ByteLevelPostProcessor(config); + + case "RobertaProcessing": + return new RobertaProcessing(config); + case "BertProcessing": + return new BertProcessing(config); + + default: + throw new Error(`Unknown PostProcessor type: ${config.type}`); } + } - /** - * Factory method to create a PostProcessor object from a configuration object. - * - * @param {Object} config Configuration object representing a PostProcessor. - * @returns {PostProcessor} A PostProcessor object created from the given configuration. - * @throws {Error} If an unknown PostProcessor type is encountered. - */ - static fromConfig(config) { - if (config === null) return null; - switch (config.type) { - case 'TemplateProcessing': - return new TemplateProcessing(config); + /** + * Method to be implemented in subclass to apply post-processing on the given tokens. + * + * @param {Array} tokens The input tokens to be post-processed. + * @param {...*} args Additional arguments required by the post-processing logic. + * @returns {PostProcessedOutput} The post-processed tokens. + * @throws {Error} If the method is not implemented in subclass. + */ + post_process(tokens, ...args) { + throw Error("post_process should be implemented in subclass."); + } - case 'ByteLevel': - return new ByteLevelPostProcessor(config); - - case 'RobertaProcessing': - return new RobertaProcessing(config); - case 'BertProcessing': - return new BertProcessing(config); - - default: - throw new Error(`Unknown PostProcessor type: ${config.type}`); - } - } - - /** - * Method to be implemented in subclass to apply post-processing on the given tokens. 
- * - * @param {Array} tokens The input tokens to be post-processed. - * @param {...*} args Additional arguments required by the post-processing logic. - * @returns {PostProcessedOutput} The post-processed tokens. - * @throws {Error} If the method is not implemented in subclass. - */ - post_process(tokens, ...args) { - throw Error("post_process should be implemented in subclass.") - } - - /** - * Alias for {@link PostProcessor#post_process}. - * @param {Array} tokens The text or array of texts to post-process. - * @param {...*} args Additional arguments required by the post-processing logic. - * @returns {PostProcessedOutput} The post-processed tokens. - */ - _call(tokens, ...args) { - return this.post_process(tokens, ...args); - } + /** + * Alias for {@link PostProcessor#post_process}. + * @param {Array} tokens The text or array of texts to post-process. + * @param {...*} args Additional arguments required by the post-processing logic. + * @returns {PostProcessedOutput} The post-processed tokens. + */ + _call(tokens, ...args) { + return this.post_process(tokens, ...args); + } } /** * A post-processor that adds special tokens to the beginning and end of the input. */ class BertProcessing extends PostProcessor { - /** - * @param {Object} config The configuration for the post-processor. - * @param {string[]} config.cls The special tokens to add to the beginning of the input. - * @param {string[]} config.sep The special tokens to add to the end of the input. - */ - constructor(config) { - super(config); - // TODO use all of config: add_prefix_space, trim_offsets + /** + * @param {Object} config The configuration for the post-processor. + * @param {string[]} config.cls The special tokens to add to the beginning of the input. + * @param {string[]} config.sep The special tokens to add to the end of the input. 
+ */ + constructor(config) { + super(config); + // TODO use all of config: add_prefix_space, trim_offsets - this.cls = config.cls[0]; - this.sep = config.sep[0]; + this.cls = config.cls[0]; + this.sep = config.sep[0]; + } + + /** + * Adds the special tokens to the beginning and end of the input. + * @param {string[]} tokens The input tokens. + * @param {string[]} [tokens_pair=null] An optional second set of input tokens. + * @returns {PostProcessedOutput} The post-processed tokens with the special tokens added to the beginning and end. + */ + post_process(tokens, tokens_pair = null, { add_special_tokens = true } = {}) { + if (add_special_tokens) { + tokens = mergeArrays([this.cls], tokens, [this.sep]); } - /** - * Adds the special tokens to the beginning and end of the input. - * @param {string[]} tokens The input tokens. - * @param {string[]} [tokens_pair=null] An optional second set of input tokens. - * @returns {PostProcessedOutput} The post-processed tokens with the special tokens added to the beginning and end. - */ - post_process(tokens, tokens_pair = null, { - add_special_tokens = true, - } = {}) { - if (add_special_tokens) { - tokens = mergeArrays([this.cls], tokens, [this.sep]); - } + let token_type_ids = new Array(tokens.length).fill(0); + if (tokens_pair !== null) { + // NOTE: It is intended to add 2 EOS tokens after the first set of tokens + // https://github.com/huggingface/tokenizers/issues/983 + const middle = + add_special_tokens && this instanceof RobertaProcessing + ? [this.sep] + : []; + const after = add_special_tokens ? [this.sep] : []; - let token_type_ids = new Array(tokens.length).fill(0); - if (tokens_pair !== null) { - // NOTE: It is intended to add 2 EOS tokens after the first set of tokens - // https://github.com/huggingface/tokenizers/issues/983 - const middle = (add_special_tokens && this instanceof RobertaProcessing) - ? [this.sep] - : []; - const after = add_special_tokens ? 
[this.sep] : []; - - tokens = mergeArrays(tokens, middle, tokens_pair, after); - token_type_ids = mergeArrays(token_type_ids, new Array(tokens_pair.length + middle.length + after.length).fill(1)); - } - return { tokens, token_type_ids }; + tokens = mergeArrays(tokens, middle, tokens_pair, after); + token_type_ids = mergeArrays( + token_type_ids, + new Array(tokens_pair.length + middle.length + after.length).fill(1), + ); } + return { tokens, token_type_ids }; + } } -class RobertaProcessing extends BertProcessing { } // NOTE: extends BertProcessing +class RobertaProcessing extends BertProcessing {} // NOTE: extends BertProcessing /** * Post processor that replaces special tokens in a template with actual tokens. * @extends PostProcessor */ class TemplateProcessing extends PostProcessor { - /** - * Creates a new instance of `TemplateProcessing`. - * @param {Object} config The configuration options for the post processor. - * @param {Array} config.single The template for a single sequence of tokens. - * @param {Array} config.pair The template for a pair of sequences of tokens. - */ - constructor(config) { - super(config); + /** + * Creates a new instance of `TemplateProcessing`. + * @param {Object} config The configuration options for the post processor. + * @param {Array} config.single The template for a single sequence of tokens. + * @param {Array} config.pair The template for a pair of sequences of tokens. + */ + constructor(config) { + super(config); - this.single = config.single; - this.pair = config.pair; - } + this.single = config.single; + this.pair = config.pair; + } - /** - * Replaces special tokens in the template with actual tokens. - * @param {string[]} tokens The list of tokens for the first sequence. - * @param {string[]} [tokens_pair=null] The list of tokens for the second sequence (optional). - * @returns {PostProcessedOutput} An object containing the list of tokens with the special tokens replaced with actual tokens. 
- */ - post_process(tokens, tokens_pair = null, { - add_special_tokens = true, - } = {}) { - const type = tokens_pair === null ? this.single : this.pair + /** + * Replaces special tokens in the template with actual tokens. + * @param {string[]} tokens The list of tokens for the first sequence. + * @param {string[]} [tokens_pair=null] The list of tokens for the second sequence (optional). + * @returns {PostProcessedOutput} An object containing the list of tokens with the special tokens replaced with actual tokens. + */ + post_process(tokens, tokens_pair = null, { add_special_tokens = true } = {}) { + const type = tokens_pair === null ? this.single : this.pair; - let processedTokens = []; - let types = []; - for (const item of type) { - if ('SpecialToken' in item) { - if (add_special_tokens) { - processedTokens.push(item.SpecialToken.id); - types.push(item.SpecialToken.type_id); - } - } else if ('Sequence' in item) { - if (item.Sequence.id === 'A') { - processedTokens = mergeArrays(processedTokens, tokens); - types = mergeArrays(types, new Array(tokens.length).fill(item.Sequence.type_id)); - - } else if (item.Sequence.id === 'B') { - processedTokens = mergeArrays(processedTokens, tokens_pair); - types = mergeArrays(types, new Array(tokens_pair.length).fill(item.Sequence.type_id)); - } - } + let processedTokens = []; + let types = []; + for (const item of type) { + if ("SpecialToken" in item) { + if (add_special_tokens) { + processedTokens.push(item.SpecialToken.id); + types.push(item.SpecialToken.type_id); } - return { tokens: processedTokens, token_type_ids: types }; + } else if ("Sequence" in item) { + if (item.Sequence.id === "A") { + processedTokens = mergeArrays(processedTokens, tokens); + types = mergeArrays( + types, + new Array(tokens.length).fill(item.Sequence.type_id), + ); + } else if (item.Sequence.id === "B") { + processedTokens = mergeArrays(processedTokens, tokens_pair); + types = mergeArrays( + types, + new 
Array(tokens_pair.length).fill(item.Sequence.type_id), + ); + } + } } + return { tokens: processedTokens, token_type_ids: types }; + } } /** @@ -1700,18 +1732,18 @@ class TemplateProcessing extends PostProcessor { * @extends PostProcessor */ class ByteLevelPostProcessor extends PostProcessor { - /** - * Post process the given tokens. - * @param {string[]} tokens The list of tokens for the first sequence. - * @param {string[]} [tokens_pair=null] The list of tokens for the second sequence (optional). - * @returns {PostProcessedOutput} An object containing the post-processed tokens. - */ - post_process(tokens, tokens_pair = null) { - if (tokens_pair) { - tokens = mergeArrays(tokens, tokens_pair); - } - return { tokens }; + /** + * Post process the given tokens. + * @param {string[]} tokens The list of tokens for the first sequence. + * @param {string[]} [tokens_pair=null] The list of tokens for the second sequence (optional). + * @returns {PostProcessedOutput} An object containing the post-processed tokens. + */ + post_process(tokens, tokens_pair = null) { + if (tokens_pair) { + tokens = mergeArrays(tokens, tokens_pair); } + return { tokens }; + } } /** @@ -1719,144 +1751,147 @@ class ByteLevelPostProcessor extends PostProcessor { * @extends Callable */ class Decoder extends Callable { + /** + * Creates an instance of `Decoder`. + * + * @param {Object} config The configuration object. + */ + constructor(config) { + super(); + this.config = config; - /** - * Creates an instance of `Decoder`. - * - * @param {Object} config The configuration object. - */ - constructor(config) { - super(); - this.config = config; + /** @type {AddedToken[]} */ + this.added_tokens = []; + this.end_of_word_suffix = null; + this.trim_offsets = config.trim_offsets; + } - /** @type {AddedToken[]} */ - this.added_tokens = []; - this.end_of_word_suffix = null; - this.trim_offsets = config.trim_offsets; - } - - /** + /** * Creates a decoder instance based on the provided configuration. 
* * @param {Object} config The configuration object. * @returns {Decoder} A decoder instance. * @throws {Error} If an unknown decoder type is provided. */ - static fromConfig(config) { - if (config === null) return null; - switch (config.type) { - case 'WordPiece': - return new WordPieceDecoder(config); - case 'Metaspace': - return new MetaspaceDecoder(config); - case 'ByteLevel': - return new ByteLevelDecoder(config); + static fromConfig(config) { + if (config === null) return null; + switch (config.type) { + case "WordPiece": + return new WordPieceDecoder(config); + case "Metaspace": + return new MetaspaceDecoder(config); + case "ByteLevel": + return new ByteLevelDecoder(config); - case 'Replace': - return new ReplaceDecoder(config); - case 'ByteFallback': - return new ByteFallback(config); - case 'Fuse': - return new FuseDecoder(config); - case 'Strip': - return new StripDecoder(config); + case "Replace": + return new ReplaceDecoder(config); + case "ByteFallback": + return new ByteFallback(config); + case "Fuse": + return new FuseDecoder(config); + case "Strip": + return new StripDecoder(config); - case 'Sequence': - return new DecoderSequence(config); + case "Sequence": + return new DecoderSequence(config); - case 'CTC': - return new CTCDecoder(config); - case 'BPEDecoder': - return new BPEDecoder(config); - default: - throw new Error(`Unknown Decoder type: ${config.type}`); - } + case "CTC": + return new CTCDecoder(config); + case "BPEDecoder": + return new BPEDecoder(config); + default: + throw new Error(`Unknown Decoder type: ${config.type}`); } + } - /** - * Calls the `decode` method. - * - * @param {string[]} tokens The list of tokens. - * @returns {string} The decoded string. - */ - _call(tokens) { - return this.decode(tokens); - } + /** + * Calls the `decode` method. + * + * @param {string[]} tokens The list of tokens. + * @returns {string} The decoded string. + */ + _call(tokens) { + return this.decode(tokens); + } - /** - * Decodes a list of tokens. 
- * @param {string[]} tokens The list of tokens. - * @returns {string} The decoded string. - */ - decode(tokens) { - return this.decode_chain(tokens).join(''); - } - - /** - * Apply the decoder to a list of tokens. - * - * @param {string[]} tokens The list of tokens. - * @returns {string[]} The decoded list of tokens. - * @throws {Error} If the `decode_chain` method is not implemented in the subclass. - */ - decode_chain(tokens) { - throw Error("`decode_chain` should be implemented in subclass.") - } + /** + * Decodes a list of tokens. + * @param {string[]} tokens The list of tokens. + * @returns {string} The decoded string. + */ + decode(tokens) { + return this.decode_chain(tokens).join(""); + } + /** + * Apply the decoder to a list of tokens. + * + * @param {string[]} tokens The list of tokens. + * @returns {string[]} The decoded list of tokens. + * @throws {Error} If the `decode_chain` method is not implemented in the subclass. + */ + decode_chain(tokens) { + throw Error("`decode_chain` should be implemented in subclass."); + } } class ReplaceDecoder extends Decoder { - - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - const pattern = createPattern(this.config.pattern); - return pattern === null - ? tokens - : tokens.map(token => token.replaceAll(pattern, this.config.content)) - } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + const pattern = createPattern(this.config.pattern); + return pattern === null + ? 
tokens + : tokens.map((token) => token.replaceAll(pattern, this.config.content)); + } } - class ByteFallback extends Decoder { - constructor(config) { - super(config); + constructor(config) { + super(config); - this.text_decoder = new TextDecoder(); - } + this.text_decoder = new TextDecoder(); + } - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + const new_tokens = []; + let previous_byte_tokens = []; - const new_tokens = []; - let previous_byte_tokens = []; - - for (const token of tokens) { - let bytes = null; - if (token.length === 6 && token.startsWith('<0x') && token.endsWith('>')) { - const byte = parseInt(token.slice(3, 5), 16); - if (!isNaN(byte)) { - bytes = byte; - } - } - if (bytes !== null) { - previous_byte_tokens.push(bytes); - } else { - if (previous_byte_tokens.length > 0) { - const string = this.text_decoder.decode(Uint8Array.from(previous_byte_tokens)); - new_tokens.push(string); - previous_byte_tokens = []; - } - new_tokens.push(token); - } + for (const token of tokens) { + let bytes = null; + if ( + token.length === 6 && + token.startsWith("<0x") && + token.endsWith(">") + ) { + const byte = parseInt(token.slice(3, 5), 16); + if (!isNaN(byte)) { + bytes = byte; } + } + if (bytes !== null) { + previous_byte_tokens.push(bytes); + } else { if (previous_byte_tokens.length > 0) { - const string = this.text_decoder.decode(Uint8Array.from(previous_byte_tokens)); - new_tokens.push(string); - previous_byte_tokens = []; + const string = this.text_decoder.decode( + Uint8Array.from(previous_byte_tokens), + ); + new_tokens.push(string); + previous_byte_tokens = []; } - - return new_tokens; + new_tokens.push(token); + } } + if (previous_byte_tokens.length > 0) { + const string = this.text_decoder.decode( + Uint8Array.from(previous_byte_tokens), + ); + new_tokens.push(string); + previous_byte_tokens = []; + } + + return new_tokens; + } } /** @@ -1865,50 +1900,48 @@ class 
ByteFallback extends Decoder { * exists incase some decoders need to happen after that step */ class FuseDecoder extends Decoder { - - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - return [tokens.join('')]; - } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + return [tokens.join("")]; + } } - class StripDecoder extends Decoder { - constructor(config) { - super(config); + constructor(config) { + super(config); - this.content = this.config.content; - this.start = this.config.start; - this.stop = this.config.stop; - } + this.content = this.config.content; + this.start = this.config.start; + this.stop = this.config.stop; + } - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - return tokens.map(token => { - let start_cut = 0; - for (let i = 0; i < this.start; ++i) { - if (token[i] === this.content) { - start_cut = i + 1; - continue; - } else { - break; - } - } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + return tokens.map((token) => { + let start_cut = 0; + for (let i = 0; i < this.start; ++i) { + if (token[i] === this.content) { + start_cut = i + 1; + continue; + } else { + break; + } + } - let stop_cut = token.length; - for (let i = 0; i < this.stop; ++i) { - const index = token.length - i - 1; - if (token[index] === this.content) { - stop_cut = index; - continue; - } else { - break; - } - } + let stop_cut = token.length; + for (let i = 0; i < this.stop; ++i) { + const index = token.length - i - 1; + if (token[index] === this.content) { + stop_cut = index; + continue; + } else { + break; + } + } - return token.slice(start_cut, stop_cut) - }); - } + return token.slice(start_cut, stop_cut); + }); + } } /** @@ -1916,36 +1949,35 @@ class StripDecoder extends Decoder { * @extends Decoder */ class WordPieceDecoder extends Decoder { + /** + * Creates a new instance of WordPieceDecoder. + * @param {Object} config The configuration object. 
+ * @param {string} config.prefix The prefix used for WordPiece encoding. + * @param {boolean} config.cleanup Whether to cleanup the decoded string. + */ + constructor(config) { + super(config); + this.cleanup = config.cleanup; + } - /** - * Creates a new instance of WordPieceDecoder. - * @param {Object} config The configuration object. - * @param {string} config.prefix The prefix used for WordPiece encoding. - * @param {boolean} config.cleanup Whether to cleanup the decoded string. - */ - constructor(config) { - super(config); - this.cleanup = config.cleanup; - } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + return tokens.map((token, i) => { + if (i !== 0) { + if (token.startsWith(this.config.prefix)) { + // NOTE: .replace() is intended; only replace first occurrence + token = token.replace(this.config.prefix, ""); + } else { + token = " " + token; + } + } + if (this.cleanup) { + token = clean_up_tokenization(token); + } - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - return tokens.map((token, i) => { - if (i !== 0) { - if (token.startsWith(this.config.prefix)) { - // NOTE: .replace() is intended; only replace first occurrence - token = token.replace(this.config.prefix, ''); - } else { - token = ' ' + token; - } - } - if (this.cleanup) { - token = clean_up_tokenization(token) - } - - return token; - }); - } + return token; + }); + } } /** @@ -1953,69 +1985,70 @@ class WordPieceDecoder extends Decoder { * @extends Decoder */ class ByteLevelDecoder extends Decoder { + /** + * Create a `ByteLevelDecoder` object. + * @param {Object} config Configuration object. + */ + constructor(config) { + super(config); - /** - * Create a `ByteLevelDecoder` object. - * @param {Object} config Configuration object. 
- */ - constructor(config) { - super(config); + this.byte_decoder = UNICODE_TO_BYTES; + this.text_decoder = new TextDecoder("utf-8", { + fatal: false, + ignoreBOM: true, + }); - this.byte_decoder = UNICODE_TO_BYTES; - this.text_decoder = new TextDecoder("utf-8", { - fatal: false, - ignoreBOM: true, - }); + this.end_of_word_suffix = null; + } - this.end_of_word_suffix = null; - } + /** + * Convert an array of tokens to string by decoding each byte. + * @param {string[]} tokens Array of tokens to be decoded. + * @returns {string} The decoded string. + */ + convert_tokens_to_string(tokens) { + const text = tokens.join(""); + const byteArray = new Uint8Array( + [...text].map((c) => this.byte_decoder[c]), + ); + const decoded_text = this.text_decoder.decode(byteArray); + return decoded_text; + } - /** - * Convert an array of tokens to string by decoding each byte. - * @param {string[]} tokens Array of tokens to be decoded. - * @returns {string} The decoded string. - */ - convert_tokens_to_string(tokens) { - const text = tokens.join(''); - const byteArray = new Uint8Array([...text].map(c => this.byte_decoder[c])); - const decoded_text = this.text_decoder.decode(byteArray); - return decoded_text; - } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + // TODO move to base class (like HF) + // tokens === filtered_tokens - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - // TODO move to base class (like HF) - // tokens === filtered_tokens + // To avoid mixing byte-level and unicode for byte-level BPT + // we need to build string separately for added tokens and byte-level tokens + // cf. 
https://github.com/huggingface/transformers/issues/1133 + const sub_texts = []; + let current_sub_text = []; + for (const token of tokens) { + // tokens sent here are already filtered, so we don't need to do this + // if (skip_special_tokens && this.all_special_ids.includes(token)) { + // continue; + // } - // To avoid mixing byte-level and unicode for byte-level BPT - // we need to build string separately for added tokens and byte-level tokens - // cf. https://github.com/huggingface/transformers/issues/1133 - const sub_texts = []; - let current_sub_text = []; - for (const token of tokens) { - // tokens sent here are already filtered, so we don't need to do this - // if (skip_special_tokens && this.all_special_ids.includes(token)) { - // continue; - // } - - if (this.added_tokens.find(x => x.content === token) !== undefined) { - if (current_sub_text.length > 0) { - sub_texts.push(this.convert_tokens_to_string(current_sub_text)); - current_sub_text = []; - } - sub_texts.push(token); - } else { - current_sub_text.push(token); - } - } + if (this.added_tokens.find((x) => x.content === token) !== undefined) { if (current_sub_text.length > 0) { - sub_texts.push(this.convert_tokens_to_string(current_sub_text)); + sub_texts.push(this.convert_tokens_to_string(current_sub_text)); + current_sub_text = []; } - - // TODO add spaces_between_special_tokens and clean_up_tokenization_spaces options - - return sub_texts; + sub_texts.push(token); + } else { + current_sub_text.push(token); + } } + if (current_sub_text.length > 0) { + sub_texts.push(this.convert_tokens_to_string(current_sub_text)); + } + + // TODO add spaces_between_special_tokens and clean_up_tokenization_spaces options + + return sub_texts; + } } /** @@ -2023,48 +2056,48 @@ class ByteLevelDecoder extends Decoder { * See https://github.com/huggingface/tokenizers/blob/bb38f390a61883fc2f29d659af696f428d1cda6b/tokenizers/src/decoders/ctc.rs */ class CTCDecoder extends Decoder { + constructor(config) { + super(config); - 
constructor(config) { - super(config); + this.pad_token = this.config.pad_token; + this.word_delimiter_token = this.config.word_delimiter_token; + this.cleanup = this.config.cleanup; + } + /** + * Converts a connectionist-temporal-classification (CTC) output tokens into a single string. + * @param {string[]} tokens Array of tokens to be decoded. + * @returns {string} The decoded string. + */ + convert_tokens_to_string(tokens) { + if (tokens.length === 0) return ""; - this.pad_token = this.config.pad_token; - this.word_delimiter_token = this.config.word_delimiter_token; - this.cleanup = this.config.cleanup; - } - /** - * Converts a connectionist-temporal-classification (CTC) output tokens into a single string. - * @param {string[]} tokens Array of tokens to be decoded. - * @returns {string} The decoded string. - */ - convert_tokens_to_string(tokens) { - if (tokens.length === 0) return ''; - - // group same tokens into non-repeating tokens in CTC style decoding - const grouped_tokens = [tokens[0]]; - for (let i = 1; i < tokens.length; ++i) { - if (tokens[i] !== grouped_tokens.at(-1)) { - grouped_tokens.push(tokens[i]); - } - } - - // filter self.pad_token which is used as CTC-blank token - const filtered_tokens = grouped_tokens.filter(token => token !== this.pad_token); - - let text = filtered_tokens.join(''); - if (this.cleanup) { - // cleanup and replace delimiter token - text = clean_up_tokenization(text) - .replaceAll(this.word_delimiter_token, ' ') - .trim(); - } - return text; + // group same tokens into non-repeating tokens in CTC style decoding + const grouped_tokens = [tokens[0]]; + for (let i = 1; i < tokens.length; ++i) { + if (tokens[i] !== grouped_tokens.at(-1)) { + grouped_tokens.push(tokens[i]); + } } + // filter self.pad_token which is used as CTC-blank token + const filtered_tokens = grouped_tokens.filter( + (token) => token !== this.pad_token, + ); - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - return 
[this.convert_tokens_to_string(tokens)]; + let text = filtered_tokens.join(""); + if (this.cleanup) { + // cleanup and replace delimiter token + text = clean_up_tokenization(text) + .replaceAll(this.word_delimiter_token, " ") + .trim(); } + return text; + } + + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + return [this.convert_tokens_to_string(tokens)]; + } } /** @@ -2072,108 +2105,100 @@ class CTCDecoder extends Decoder { * @extends Decoder */ class DecoderSequence extends Decoder { + /** + * Creates a new instance of DecoderSequence. + * @param {Object} config The configuration object. + * @param {Decoder[]} config.decoders The list of decoders to apply. + */ + constructor(config) { + super(config); + this.decoders = config.decoders.map((x) => Decoder.fromConfig(x)); + } - /** - * Creates a new instance of DecoderSequence. - * @param {Object} config The configuration object. - * @param {Decoder[]} config.decoders The list of decoders to apply. - */ - constructor(config) { - super(config); - this.decoders = config.decoders.map(x => Decoder.fromConfig(x)); - } - - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - // Use reduce to apply each decoder to the tokens - return this.decoders.reduce((toks, decoder) => { - return decoder.decode_chain(toks); - }, tokens); - } - + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + // Use reduce to apply each decoder to the tokens + return this.decoders.reduce((toks, decoder) => { + return decoder.decode_chain(toks); + }, tokens); + } } class BPEDecoder extends Decoder { - constructor(config) { - super(config); + constructor(config) { + super(config); - this.suffix = this.config.suffix; - } - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - return tokens.map((token, i) => { - return token.replaceAll(this.suffix, (i === tokens.length - 1) ? 
'' : ' ') - }); - } + this.suffix = this.config.suffix; + } + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + return tokens.map((token, i) => { + return token.replaceAll(this.suffix, i === tokens.length - 1 ? "" : " "); + }); + } } // Custom decoder for VITS class VitsDecoder extends Decoder { - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - let decoded = ''; - for (let i = 1; i < tokens.length; i += 2) { - decoded += tokens[i]; - } - return [decoded]; + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + let decoded = ""; + for (let i = 1; i < tokens.length; i += 2) { + decoded += tokens[i]; } + return [decoded]; + } } - /** * This PreTokenizer replaces spaces with the given replacement character, adds a prefix space if requested, * and returns a list of tokens. * @extends PreTokenizer */ class MetaspacePreTokenizer extends PreTokenizer { - /** - * @param {Object} config The configuration object for the MetaspacePreTokenizer. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token. - * @param {string} config.replacement The character to replace spaces with. - * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character. - * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme. - */ - constructor(config) { - super(); + /** + * @param {Object} config The configuration object for the MetaspacePreTokenizer. + * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token. + * @param {string} config.replacement The character to replace spaces with. + * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character. + * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme. 
+ */ + constructor(config) { + super(); - this.addPrefixSpace = config.add_prefix_space; - this.replacement = config.replacement; - this.strRep = config.str_rep || this.replacement; - this.prepend_scheme = config.prepend_scheme ?? 'always'; - } - - /** - * This method takes a string, replaces spaces with the replacement character, - * adds a prefix space if requested, and returns a new list of tokens. - * @param {string} text The text to pre-tokenize. - * @param {Object} [options] The options for the pre-tokenization. - * @param {number} [options.section_index] The index of the section to pre-tokenize. - * @returns {string[]} A new list of pre-tokenized tokens. - */ - pre_tokenize_text(text, { - section_index = undefined, - } = {}) { - - let normalized = text.replaceAll(' ', this.strRep); - - if ( - // We add a prefix space if: - // (1) The addPrefixSpace option is enabled and the normalized - // token does not already start with the replacement character. - (this.addPrefixSpace && !normalized.startsWith(this.replacement)) - - // and (2) either: - // (a) prepend_scheme is 'always' - // (b) prepend_scheme is 'first' and this is the first section - && ( - this.prepend_scheme === 'always' || - (this.prepend_scheme === 'first' && section_index === 0) - ) - ) { - normalized = this.strRep + normalized; - } - return [normalized]; + this.addPrefixSpace = config.add_prefix_space; + this.replacement = config.replacement; + this.strRep = config.str_rep || this.replacement; + this.prepend_scheme = config.prepend_scheme ?? "always"; + } + + /** + * This method takes a string, replaces spaces with the replacement character, + * adds a prefix space if requested, and returns a new list of tokens. + * @param {string} text The text to pre-tokenize. + * @param {Object} [options] The options for the pre-tokenization. + * @param {number} [options.section_index] The index of the section to pre-tokenize. + * @returns {string[]} A new list of pre-tokenized tokens. 
+ */ + pre_tokenize_text(text, { section_index = undefined } = {}) { + let normalized = text.replaceAll(" ", this.strRep); + + if ( + // We add a prefix space if: + // (1) The addPrefixSpace option is enabled and the normalized + // token does not already start with the replacement character. + this.addPrefixSpace && + !normalized.startsWith(this.replacement) && + // and (2) either: + // (a) prepend_scheme is 'always' + // (b) prepend_scheme is 'first' and this is the first section + (this.prepend_scheme === "always" || + (this.prepend_scheme === "first" && section_index === 0)) + ) { + normalized = this.strRep + normalized; } + return [normalized]; + } } /** @@ -2181,31 +2206,31 @@ class MetaspacePreTokenizer extends PreTokenizer { * @extends Decoder */ class MetaspaceDecoder extends Decoder { - /** - * Constructs a new MetaspaceDecoder object. - * @param {Object} config The configuration object for the MetaspaceDecoder. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string. - * @param {string} config.replacement The string to replace spaces with. - */ - constructor(config) { - super(config); + /** + * Constructs a new MetaspaceDecoder object. + * @param {Object} config The configuration object for the MetaspaceDecoder. + * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string. + * @param {string} config.replacement The string to replace spaces with. 
+ */ + constructor(config) { + super(config); - this.addPrefixSpace = config.add_prefix_space; - this.replacement = config.replacement; - } + this.addPrefixSpace = config.add_prefix_space; + this.replacement = config.replacement; + } - /** @type {Decoder['decode_chain']} */ - decode_chain(tokens) { - const result = []; - for (let i = 0; i < tokens.length; ++i) { - let normalized = tokens[i].replaceAll(this.replacement, ' '); - if (this.addPrefixSpace && i == 0 && normalized.startsWith(' ')) { - normalized = normalized.substring(1); - } - result.push(normalized); - } - return result; + /** @type {Decoder['decode_chain']} */ + decode_chain(tokens) { + const result = []; + for (let i = 0; i < tokens.length; ++i) { + let normalized = tokens[i].replaceAll(this.replacement, " "); + if (this.addPrefixSpace && i == 0 && normalized.startsWith(" ")) { + normalized = normalized.substring(1); + } + result.push(normalized); } + return result; + } } /** @@ -2216,50 +2241,56 @@ class MetaspaceDecoder extends Decoder { * @param {Object} config.precompiled_charsmap The precompiled charsmap object. */ class Precompiled extends Normalizer { - /** - * Create a new instance of Precompiled normalizer. - * @param {Object} config The configuration object. - * @param {any} config.precompiled_charsmap Precompiled chars mapping. - */ - constructor(config) { - super(config); - this.charsmap = config.precompiled_charsmap; + /** + * Create a new instance of Precompiled normalizer. + * @param {Object} config The configuration object. + * @param {any} config.precompiled_charsmap Precompiled chars mapping. + */ + constructor(config) { + super(config); + this.charsmap = config.precompiled_charsmap; + } + + /** + * Normalizes the given text by applying the precompiled charsmap. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. 
+ */ + normalize(text) { + // As stated in the sentencepiece normalization docs (https://github.com/google/sentencepiece/blob/master/doc/normalization.md#use-pre-defined-normalization-rule), + // there are 5 pre-defined normalization rules: + // 1. nmt_nfkc: NFKC normalization with some additional normalization around spaces. (default) + // 2. nfkc: original NFKC normalization. + // 3. nmt_nfkc_cf: nmt_nfkc + Unicode case folding (mostly lower casing) + // 4. nfkc_cf: nfkc + Unicode case folding. + // 5. identity: no normalization + // + // For now, we only implement the default (nmt_nfkc). + // See https://raw.githubusercontent.com/google/sentencepiece/master/data/nmt_nfkc.tsv for the full list of rules. + // TODO: detect when a different `this.charsmap` is used. + + text = text.replace( + /[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, + "", + ); // Remove control characters + text = text.replace( + /[\u0009\u000A\u000C\u000D\u1680\u200B\u200C\u200E\u200F\u2028\u2029\u2581\uFEFF\uFFFD]/gm, + "\u0020", + ); // Replace certain characters with a space + + if (text.includes("\uFF5E")) { + // To match the sentencepiece implementation 100%, we must handle a very strange edge-case. + // For some reason, the "Fullwidth Tilde" character (\uFF5E) should not be converted to the standard Tilde character (\u007E). + // However, NFKC normalization does do this conversion. As a result, we split the string on the Fullwidth Tilde character, + // perform NFKC normalization on each substring, and then join them back together with the Fullwidth Tilde character. + const parts = text.split("\uFF5E"); + text = parts.map((part) => part.normalize("NFKC")).join("\uFF5E"); + } else { + text = text.normalize("NFKC"); } - /** - * Normalizes the given text by applying the precompiled charsmap. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. 
- */ - normalize(text) { - // As stated in the sentencepiece normalization docs (https://github.com/google/sentencepiece/blob/master/doc/normalization.md#use-pre-defined-normalization-rule), - // there are 5 pre-defined normalization rules: - // 1. nmt_nfkc: NFKC normalization with some additional normalization around spaces. (default) - // 2. nfkc: original NFKC normalization. - // 3. nmt_nfkc_cf: nmt_nfkc + Unicode case folding (mostly lower casing) - // 4. nfkc_cf: nfkc + Unicode case folding. - // 5. identity: no normalization - // - // For now, we only implement the default (nmt_nfkc). - // See https://raw.githubusercontent.com/google/sentencepiece/master/data/nmt_nfkc.tsv for the full list of rules. - // TODO: detect when a different `this.charsmap` is used. - - text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters - text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\u200B\u200C\u200E\u200F\u2028\u2029\u2581\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space - - if (text.includes('\uFF5E')) { - // To match the sentencepiece implementation 100%, we must handle a very strange edge-case. - // For some reason, the "Fullwidth Tilde" character (\uFF5E) should not be converted to the standard Tilde character (\u007E). - // However, NFKC normalization does do this conversion. As a result, we split the string on the Fullwidth Tilde character, - // perform NFKC normalization on each substring, and then join them back together with the Fullwidth Tilde character. - const parts = text.split('\uFF5E'); - text = parts.map(part => part.normalize('NFKC')).join('\uFF5E'); - } else { - text = text.normalize('NFKC'); - } - - return text; - } + return text; + } } /** @@ -2267,28 +2298,33 @@ class Precompiled extends Normalizer { * @extends PreTokenizer */ class PreTokenizerSequence extends PreTokenizer { - /** - * Creates an instance of PreTokenizerSequence. 
- * @param {Object} config The configuration object for the pre-tokenizer sequence. - * @param {Object[]} config.pretokenizers An array of pre-tokenizer configurations. - */ - constructor(config) { - super(); - this.tokenizers = config.pretokenizers.map(x => PreTokenizer.fromConfig(x)); - } + /** + * Creates an instance of PreTokenizerSequence. + * @param {Object} config The configuration object for the pre-tokenizer sequence. + * @param {Object[]} config.pretokenizers An array of pre-tokenizer configurations. + */ + constructor(config) { + super(); + this.tokenizers = config.pretokenizers.map((x) => + PreTokenizer.fromConfig(x), + ); + } - /** - * Applies each pre-tokenizer in the sequence to the input text in turn. - * @param {string} text The text to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} The pre-tokenized text. - */ - pre_tokenize_text(text, options) { - // Use reduce to apply each tokenizer to the text - return this.tokenizers.reduce((preTokenizedText, tokenizer) => { - return tokenizer.pre_tokenize(preTokenizedText, options); - }, [text]); - } + /** + * Applies each pre-tokenizer in the sequence to the input text in turn. + * @param {string} text The text to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} The pre-tokenized text. + */ + pre_tokenize_text(text, options) { + // Use reduce to apply each tokenizer to the text + return this.tokenizers.reduce( + (preTokenizedText, tokenizer) => { + return tokenizer.pre_tokenize(preTokenizedText, options); + }, + [text], + ); + } } /** @@ -2296,65 +2332,65 @@ class PreTokenizerSequence extends PreTokenizer { * @extends PreTokenizer */ class WhitespaceSplit extends PreTokenizer { - /** - * Creates an instance of WhitespaceSplit. - * @param {Object} config The configuration object for the pre-tokenizer sequence. 
- */ - constructor(config) { - super(); - } - /** - * Pre-tokenizes the input text by splitting it on whitespace characters. - * @param {string} text The text to be pre-tokenized. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens produced by splitting the input text on whitespace. - */ - pre_tokenize_text(text, options) { - return whitespace_split(text); - } + /** + * Creates an instance of WhitespaceSplit. + * @param {Object} config The configuration object for the pre-tokenizer sequence. + */ + constructor(config) { + super(); + } + /** + * Pre-tokenizes the input text by splitting it on whitespace characters. + * @param {string} text The text to be pre-tokenized. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens produced by splitting the input text on whitespace. + */ + pre_tokenize_text(text, options) { + return whitespace_split(text); + } } // NOTE: `ReplacePreTokenizer` is custom (to support `BlenderbotSmallTokenizer`) class ReplacePreTokenizer extends PreTokenizer { - /** - * @param {Object} config The configuration options for the pre-tokenizer. - * @param {Object} config.pattern The pattern used to split the text. Can be a string or a regex object. - * @param {string} config.content What to replace the pattern with. - */ - constructor(config) { - super(); - this.config = config; - this.pattern = createPattern(this.config.pattern); - this.content = this.config.content; - } + /** + * @param {Object} config The configuration options for the pre-tokenizer. + * @param {Object} config.pattern The pattern used to split the text. Can be a string or a regex object. + * @param {string} config.content What to replace the pattern with. 
+ */ + constructor(config) { + super(); + this.config = config; + this.pattern = createPattern(this.config.pattern); + this.content = this.config.content; + } - /** - * Pre-tokenizes the input text by replacing certain characters. - * @param {string} text The text to be pre-tokenized. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of tokens produced by replacing certain characters. - */ - pre_tokenize_text(text, options) { - if (this.pattern === null) { - return [text]; - } - return [text.replaceAll(this.pattern, this.config.content)]; + /** + * Pre-tokenizes the input text by replacing certain characters. + * @param {string} text The text to be pre-tokenized. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of tokens produced by replacing certain characters. + */ + pre_tokenize_text(text, options) { + if (this.pattern === null) { + return [text]; } + return [text.replaceAll(this.pattern, this.config.content)]; + } } const SPECIAL_TOKEN_ATTRIBUTES = [ - 'bos_token', - 'eos_token', - 'unk_token', - 'sep_token', - 'pad_token', - 'cls_token', - 'mask_token', - // additional_special_tokens (TODO) -] + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + // additional_special_tokens (TODO) +]; /** - * + * * Helper function for padding values of an object, which are each arrays. * NOTE: No additional checks are made here for validity of arguments. * @param {Record} item The input object. 
@@ -2364,15 +2400,16 @@ const SPECIAL_TOKEN_ATTRIBUTES = [ * @private */ function padHelper(item, length, value_fn, side) { - for (const key of Object.keys(item)) { - const diff = length - item[key].length; - const value = value_fn(key); + for (const key of Object.keys(item)) { + const diff = length - item[key].length; + const value = value_fn(key); - const padData = new Array(diff).fill(value); - item[key] = side === 'right' - ? mergeArrays(item[key], padData) - : mergeArrays(padData, item[key]); - } + const padData = new Array(diff).fill(value); + item[key] = + side === "right" + ? mergeArrays(item[key], padData) + : mergeArrays(padData, item[key]); + } } /** @@ -2383,622 +2420,643 @@ function padHelper(item, length, value_fn, side) { * @private */ function truncateHelper(item, length) { - // Setting .length to a lower value truncates the array in-place: - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/length - for (const key of Object.keys(item)) { - item[key].length = length; - } + // Setting .length to a lower value truncates the array in-place: + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/length + for (const key of Object.keys(item)) { + item[key].length = length; + } } - export class PreTrainedTokenizer extends Callable { - return_token_type_ids = false; + return_token_type_ids = false; - _default_chat_template = `{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}`; + _default_chat_template = `{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}`; - /** - * Create a new PreTrainedTokenizer instance. - * @param {Object} tokenizerJSON The JSON of the tokenizer. 
- * @param {Object} tokenizerConfig The config of the tokenizer. - */ - constructor(tokenizerJSON, tokenizerConfig) { - super(); + /** + * Create a new PreTrainedTokenizer instance. + * @param {Object} tokenizerJSON The JSON of the tokenizer. + * @param {Object} tokenizerConfig The config of the tokenizer. + */ + constructor(tokenizerJSON, tokenizerConfig) { + super(); - this._tokenizer_config = tokenizerConfig; + this._tokenizer_config = tokenizerConfig; - // Construct parts of the tokenizer from the JSON - this.normalizer = Normalizer.fromConfig(tokenizerJSON.normalizer); - this.pre_tokenizer = PreTokenizer.fromConfig(tokenizerJSON.pre_tokenizer); - this.model = TokenizerModel.fromConfig(tokenizerJSON.model, tokenizerConfig); - this.post_processor = PostProcessor.fromConfig(tokenizerJSON.post_processor); - this.decoder = Decoder.fromConfig(tokenizerJSON.decoder); + // Construct parts of the tokenizer from the JSON + this.normalizer = Normalizer.fromConfig(tokenizerJSON.normalizer); + this.pre_tokenizer = PreTokenizer.fromConfig(tokenizerJSON.pre_tokenizer); + this.model = TokenizerModel.fromConfig( + tokenizerJSON.model, + tokenizerConfig, + ); + this.post_processor = PostProcessor.fromConfig( + tokenizerJSON.post_processor, + ); + this.decoder = Decoder.fromConfig(tokenizerJSON.decoder); - // Add added_tokens to model - this.special_tokens = []; - this.all_special_ids = []; + // Add added_tokens to model + this.special_tokens = []; + this.all_special_ids = []; - /** @type {AddedToken[]} */ - this.added_tokens = []; - for (const addedToken of tokenizerJSON.added_tokens) { - const token = new AddedToken(addedToken); - this.added_tokens.push(token); + /** @type {AddedToken[]} */ + this.added_tokens = []; + for (const addedToken of tokenizerJSON.added_tokens) { + const token = new AddedToken(addedToken); + this.added_tokens.push(token); - this.model.tokens_to_ids.set(token.content, token.id); - this.model.vocab[token.id] = token.content; + 
this.model.tokens_to_ids.set(token.content, token.id); + this.model.vocab[token.id] = token.content; - if (token.special) { - this.special_tokens.push(token.content); - this.all_special_ids.push(token.id); - } - } - - // Update additional_special_tokens - this.additional_special_tokens = tokenizerConfig.additional_special_tokens ?? []; - this.special_tokens.push(...this.additional_special_tokens); - this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates - - if (this.decoder) { - // Slight hack, but it prevents code duplication: - this.decoder.added_tokens = this.added_tokens; - - // Another slight hack to add `end_of_word_suffix` (if present) to the decoder - // This is needed for cases where BPE model and ByteLevel decoder are used - // For more information, see https://github.com/xenova/transformers.js/issues/74 - // TODO: save this to the decoder when exporting? - this.decoder.end_of_word_suffix = this.model.end_of_word_suffix; - } - - - this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp( - this.added_tokens.map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`).join('|') - ) : null; - - // Set mask token if present (otherwise will be undefined, which is fine) - this.mask_token = this.getToken('mask_token'); - this.mask_token_id = this.model.tokens_to_ids.get(this.mask_token); - - this.pad_token = this.getToken('pad_token', 'eos_token'); - this.pad_token_id = this.model.tokens_to_ids.get(this.pad_token); - - this.sep_token = this.getToken('sep_token'); - this.sep_token_id = this.model.tokens_to_ids.get(this.sep_token); - - this.unk_token = this.getToken(tokenizerConfig, 'unk_token'); - this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token); - - this.model_max_length = tokenizerConfig.model_max_length; - - /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). 
*/ - this.remove_space = tokenizerConfig.remove_space; - - this.clean_up_tokenization_spaces = tokenizerConfig.clean_up_tokenization_spaces ?? true; - this.do_lowercase_and_remove_accent = tokenizerConfig.do_lowercase_and_remove_accent ?? false; - - // TODO allow user to change this - /** @type {'right'|'left'} */ - this.padding_side = 'right'; - - this.legacy = false; - - this.chat_template = tokenizerConfig.chat_template ?? null; - this._compiled_template_cache = new Map(); + if (token.special) { + this.special_tokens.push(token.content); + this.all_special_ids.push(token.id); + } } - /** - * Returns the value of the first matching key in the tokenizer config object. - * @param {...string} keys One or more keys to search for in the tokenizer config object. - * @returns {string|null} The value associated with the first matching key, or null if no match is found. - * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken". - */ - getToken(...keys) { - for (const key of keys) { - const item = this._tokenizer_config[key]; + // Update additional_special_tokens + this.additional_special_tokens = + tokenizerConfig.additional_special_tokens ?? []; + this.special_tokens.push(...this.additional_special_tokens); + this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates - if (!item) continue; + if (this.decoder) { + // Slight hack, but it prevents code duplication: + this.decoder.added_tokens = this.added_tokens; - if (typeof item === 'object') { - if (item.__type === 'AddedToken') { - return item.content; - } else { - throw Error(`Unknown token: ${item}`); - } - } else { - return item; - } - } - return null; + // Another slight hack to add `end_of_word_suffix` (if present) to the decoder + // This is needed for cases where BPE model and ByteLevel decoder are used + // For more information, see https://github.com/xenova/transformers.js/issues/74 + // TODO: save this to the decoder when exporting? 
+ this.decoder.end_of_word_suffix = this.model.end_of_word_suffix; } - /** - * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`. - * - * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer. - * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. - * - * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`. - * @returns {Promise} A new instance of the `PreTrainedTokenizer` class. - */ - static async from_pretrained(pretrained_model_name_or_path, { - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - legacy = null, - } = {}) { + this.added_tokens_regex = + this.added_tokens.length > 0 + ? new RegExp( + this.added_tokens + .map( + (x) => + `${x.lstrip ? "\\s*" : ""}(${escapeRegExp(x.content)})${x.rstrip ? "\\s*" : ""}`, + ) + .join("|"), + ) + : null; - const info = await loadTokenizer(pretrained_model_name_or_path, { - progress_callback, - config, - cache_dir, - local_files_only, - revision, - legacy, - }) + // Set mask token if present (otherwise will be undefined, which is fine) + this.mask_token = this.getToken("mask_token"); + this.mask_token_id = this.model.tokens_to_ids.get(this.mask_token); - // @ts-ignore - return new this(...info); - } + this.pad_token = this.getToken("pad_token", "eos_token"); + this.pad_token_id = this.model.tokens_to_ids.get(this.pad_token); - /** - * @typedef {number[]|number[][]|Tensor} BatchEncodingItem - * - * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function. - * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model. - * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model. - * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model. 
- */ + this.sep_token = this.getToken("sep_token"); + this.sep_token_id = this.model.tokens_to_ids.get(this.sep_token); - /** - * Encode/tokenize the given text(s). - * @param {string|string[]} text The text to tokenize. - * @param {Object} options An optional object containing the following properties: - * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text. - * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences. - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @param {boolean} [options.truncation=null] Whether to truncate the input sequences. - * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length. - * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays. - * @returns {BatchEncoding} Object to be passed to the model. - */ - _call( - // Required positional arguments - text, + this.unk_token = this.getToken(tokenizerConfig, "unk_token"); + this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token); - // Optional keyword arguments - { - text_pair = null, - add_special_tokens = true, - padding = false, - truncation = null, - max_length = null, - return_tensor = true, // Different to HF - } = {}, - ) { + this.model_max_length = tokenizerConfig.model_max_length; - const isBatched = Array.isArray(text); + /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */ + this.remove_space = tokenizerConfig.remove_space; - /** @type {EncodingSingle[]} */ - let encodedTokens; + this.clean_up_tokenization_spaces = + tokenizerConfig.clean_up_tokenization_spaces ?? true; + this.do_lowercase_and_remove_accent = + tokenizerConfig.do_lowercase_and_remove_accent ?? 
false; - if (isBatched) { - if (text.length === 0) { - throw Error('text array must be non-empty') - } + // TODO allow user to change this + /** @type {'right'|'left'} */ + this.padding_side = "right"; - if (text_pair !== null) { - if (!Array.isArray(text_pair)) { - throw Error('text_pair must also be an array') + this.legacy = false; - } else if (text.length !== text_pair.length) { - throw Error('text and text_pair must have the same length') - } + this.chat_template = tokenizerConfig.chat_template ?? null; + this._compiled_template_cache = new Map(); + } - encodedTokens = text.map( - (t, i) => this._encode_plus(t, text_pair[i], { add_special_tokens }) - ) + /** + * Returns the value of the first matching key in the tokenizer config object. + * @param {...string} keys One or more keys to search for in the tokenizer config object. + * @returns {string|null} The value associated with the first matching key, or null if no match is found. + * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken". + */ + getToken(...keys) { + for (const key of keys) { + const item = this._tokenizer_config[key]; - } else { - encodedTokens = text.map(x => this._encode_plus(x, null, { add_special_tokens })); - } + if (!item) continue; + if (typeof item === "object") { + if (item.__type === "AddedToken") { + return item.content; } else { - if (text === null) { - throw Error('text may not be null') - } - - if (Array.isArray(text_pair)) { - throw Error('When specifying `text_pair`, since `text` is a string, `text_pair` must also be a string (i.e., not an array).') - } - - // For single input, we just wrap in an array, and then unwrap later. - encodedTokens = [this._encode_plus(text, text_pair, { add_special_tokens })]; + throw Error(`Unknown token: ${item}`); } - // At this point, tokens is batched: [batch_size, tokens] - // However, array may be jagged. 
So, we pad to max_length + } else { + return item; + } + } + return null; + } - if (max_length === null) { - if (padding === 'max_length') { - max_length = this.model_max_length; - } else { - // Calculate max length from sequences - max_length = max(encodedTokens.map(x => x.input_ids.length))[0]; - } + /** + * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`. + * + * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer. + * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. + * + * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`. + * @returns {Promise} A new instance of the `PreTrainedTokenizer` class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + legacy = null, + } = {}, + ) { + const info = await loadTokenizer(pretrained_model_name_or_path, { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + legacy, + }); + + // @ts-ignore + return new this(...info); + } + + /** + * @typedef {number[]|number[][]|Tensor} BatchEncodingItem + * + * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function. + * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model. + * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model. + * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model. + */ + + /** + * Encode/tokenize the given text(s). + * @param {string|string[]} text The text to tokenize. + * @param {Object} options An optional object containing the following properties: + * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. 
If set, must be the same type as text. + * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences. + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. + * @param {boolean} [options.truncation=null] Whether to truncate the input sequences. + * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length. + * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays. + * @returns {BatchEncoding} Object to be passed to the model. + */ + _call( + // Required positional arguments + text, + + // Optional keyword arguments + { + text_pair = null, + add_special_tokens = true, + padding = false, + truncation = null, + max_length = null, + return_tensor = true, // Different to HF + } = {}, + ) { + const isBatched = Array.isArray(text); + + /** @type {EncodingSingle[]} */ + let encodedTokens; + + if (isBatched) { + if (text.length === 0) { + throw Error("text array must be non-empty"); + } + + if (text_pair !== null) { + if (!Array.isArray(text_pair)) { + throw Error("text_pair must also be an array"); + } else if (text.length !== text_pair.length) { + throw Error("text and text_pair must have the same length"); + } + + encodedTokens = text.map((t, i) => + this._encode_plus(t, text_pair[i], { add_special_tokens }), + ); + } else { + encodedTokens = text.map((x) => + this._encode_plus(x, null, { add_special_tokens }), + ); + } + } else { + if (text === null) { + throw Error("text may not be null"); + } + + if (Array.isArray(text_pair)) { + throw Error( + "When specifying `text_pair`, since `text` is a string, `text_pair` must also be a string (i.e., not an array).", + ); + } + + // For single input, we just wrap in an array, and then unwrap later. 
+ encodedTokens = [ + this._encode_plus(text, text_pair, { add_special_tokens }), + ]; + } + // At this point, tokens is batched: [batch_size, tokens] + // However, array may be jagged. So, we pad to max_length + + if (max_length === null) { + if (padding === "max_length") { + max_length = this.model_max_length; + } else { + // Calculate max length from sequences + max_length = max(encodedTokens.map((x) => x.input_ids.length))[0]; + } + } else { + if (!truncation) { + console.warn( + `Truncation was not explicitly activated but \`max_length\` is provided a specific value, please use \`truncation=true\` to explicitly truncate examples to max length.`, + ); + } + } + + // Ensure it is less than model max length + max_length = Math.min(max_length, this.model_max_length); + + if (padding || truncation) { + // Perform padding and/or truncation + for (let i = 0; i < encodedTokens.length; ++i) { + if (encodedTokens[i].input_ids.length === max_length) { + continue; + } else if (encodedTokens[i].input_ids.length > max_length) { + // possibly truncate + if (truncation) { + truncateHelper(encodedTokens[i], max_length); + } } else { - if (!truncation) { - console.warn(`Truncation was not explicitly activated but \`max_length\` is provided a specific value, please use \`truncation=true\` to explicitly truncate examples to max length.`) - } + // t.length < max_length + // possibly pad + if (padding) { + padHelper( + encodedTokens[i], + max_length, + (key) => (key === "input_ids" ? 
this.pad_token_id : 0), + this.padding_side, + ); + } } + } + } - // Ensure it is less than model max length - max_length = Math.min(max_length, this.model_max_length) + const result = {}; - if (padding || truncation) { + if (return_tensor) { + if (!(padding && truncation)) { + // Not, guaranteed that all items have same length, so + // we perform additional check - // Perform padding and/or truncation - for (let i = 0; i < encodedTokens.length; ++i) { - if (encodedTokens[i].input_ids.length === max_length) { - continue; - - } else if (encodedTokens[i].input_ids.length > max_length) { - // possibly truncate - if (truncation) { - truncateHelper(encodedTokens[i], max_length); - } - - } else { // t.length < max_length - // possibly pad - if (padding) { - padHelper( - encodedTokens[i], - max_length, - key => key === 'input_ids' ? this.pad_token_id : 0, - this.padding_side - ); - } - } + if ( + encodedTokens.some((x) => { + for (const key of Object.keys(x)) { + if (x[key].length !== encodedTokens[0][key]?.length) { + return true; + } } + return false; + }) + ) { + throw Error( + "Unable to create tensor, you should probably activate truncation and/or padding " + + "with 'padding=true' and 'truncation=true' to have batched tensors with the same length.", + ); } + } - const result = {}; + // Now we actually convert to tensor + // NOTE: In the same way as the python library, we return a batched tensor, regardless of + // whether we have a single input or multiple inputs. 
+ const dims = [encodedTokens.length, encodedTokens[0].input_ids.length]; - if (return_tensor) { - if (!(padding && truncation)) { - // Not, guaranteed that all items have same length, so - // we perform additional check + for (const key of Object.keys(encodedTokens[0])) { + result[key] = new Tensor( + "int64", + BigInt64Array.from(encodedTokens.flatMap((x) => x[key]).map(BigInt)), + dims, + ); + } + } else { + for (const key of Object.keys(encodedTokens[0])) { + result[key] = encodedTokens.map((x) => x[key]); + } - if ( - encodedTokens.some(x => { - for (const key of Object.keys(x)) { - if (x[key].length !== encodedTokens[0][key]?.length) { - return true; - } - } - return false; - }) - ) { - throw Error( - "Unable to create tensor, you should probably activate truncation and/or padding " + - "with 'padding=true' and 'truncation=true' to have batched tensors with the same length." - ) - } - } + // If not returning a tensor, we match the input type + if (!isBatched) { + // Input was not batched, so we unwrap + for (const key of Object.keys(result)) { + result[key] = result[key][0]; + } + } + } - // Now we actually convert to tensor - // NOTE: In the same way as the python library, we return a batched tensor, regardless of - // whether we have a single input or multiple inputs. - const dims = [encodedTokens.length, encodedTokens[0].input_ids.length]; + return /** @type {BatchEncoding} */ (result); + } - for (const key of Object.keys(encodedTokens[0])) { - result[key] = new Tensor('int64', - BigInt64Array.from(encodedTokens.flatMap(x => x[key]).map(BigInt)), - dims - ); - } + /** + * Encodes a single text using the preprocessor pipeline of the tokenizer. + * + * @param {string|null} text The text to encode. + * @returns {string[]|null} The encoded tokens. + */ + _encode_text(text) { + if (text === null) return null; + // Actual function which does encoding, for a single text + // First, we take care of special tokens. 
Needed to avoid issues arising from + // normalization and/or pretokenization (which may not preserve special tokens) + const sections = this.added_tokens_regex + ? text.split(this.added_tokens_regex).filter((x) => x) + : [text]; + + const tokens = sections + .map((x, section_index) => { + const addedToken = this.added_tokens.find((t) => t.content === x); + if (addedToken !== undefined) { + // Ignore added tokens + return x; } else { - for (const key of Object.keys(encodedTokens[0])) { - result[key] = encodedTokens.map(x => x[key]); - } + if (this.remove_space === true) { + x = x.trim().split(/\s+/).join(" "); + } + if (this.do_lowercase_and_remove_accent) { + x = lowercase_and_remove_accent(x); + } - // If not returning a tensor, we match the input type - if (!isBatched) { - // Input was not batched, so we unwrap - for (const key of Object.keys(result)) { - result[key] = result[key][0]; - } - } + if (this.normalizer !== null) { + x = this.normalizer(x); + } + + const sectionTokens = + this.pre_tokenizer !== null + ? this.pre_tokenizer(x, { + section_index, + }) + : [x]; + + const tokens = this.model(sectionTokens); + + return tokens; } + }) + .flat(); - return /** @type {BatchEncoding} */(result); + return tokens; + } + + /** + * Encodes a single text or a pair of texts using the model's tokenizer. + * + * @param {string} text The text to encode. + * @param {string|null} text_pair The optional second text to encode. + * @param {Object} options An optional object containing the following properties: + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. + * @returns {EncodingSingle} An object containing the encoded text. 
+ * @private + */ + _encode_plus(text, text_pair = null, { add_special_tokens = true } = {}) { + // Function called by users to encode possibly multiple texts + const tokens = this._encode_text(text); + const tokens2 = this._encode_text(text_pair); + + const combinedTokens = this.post_processor + ? this.post_processor(tokens, tokens2, { add_special_tokens }) + : { tokens: mergeArrays(tokens ?? [], tokens2 ?? []) }; + + const input_ids = this.model.convert_tokens_to_ids(combinedTokens.tokens); + + const result = { + input_ids, + attention_mask: new Array(input_ids.length).fill(1), + }; + if (this.return_token_type_ids && combinedTokens.token_type_ids) { + result.token_type_ids = combinedTokens.token_type_ids; + } + return result; + } + + /** + * Encodes a single text or a pair of texts using the model's tokenizer. + * + * @param {string} text The text to encode. + * @param {string|null} text_pair The optional second text to encode. + * @param {Object} options An optional object containing the following properties: + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. + * @returns {number[]} An array of token IDs representing the encoded text(s). + */ + encode(text, text_pair = null, { add_special_tokens = true } = {}) { + const { input_ids } = this._encode_plus(text, text_pair, { + add_special_tokens, + }); + return input_ids; + } + + /** + * Decode a batch of tokenized sequences. + * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences. + * @param {Object} decode_args (Optional) Object with decoding arguments. + * @returns {string[]} List of decoded sequences. + */ + batch_decode(batch, decode_args = {}) { + if (batch instanceof Tensor) { + batch = batch.tolist(); + } + return batch.map((x) => this.decode(x, decode_args)); + } + + /** + * Decodes a sequence of token IDs back to a string. 
+ * + * @param {number[]|Tensor} token_ids List/Tensor of token IDs to decode. + * @param {Object} [decode_args={}] + * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string. + * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed. + * + * @returns {string} The decoded string. + * @throws {Error} If `token_ids` is not a non-empty array of integers. + */ + decode(token_ids, decode_args = {}) { + if (token_ids instanceof Tensor) { + token_ids = prepareTensorForDecode(token_ids); } - /** - * Encodes a single text using the preprocessor pipeline of the tokenizer. - * - * @param {string|null} text The text to encode. - * @returns {string[]|null} The encoded tokens. - */ - _encode_text(text) { - if (text === null) return null; - - // Actual function which does encoding, for a single text - // First, we take care of special tokens. Needed to avoid issues arising from - // normalization and/or pretokenization (which may not preserve special tokens) - const sections = this.added_tokens_regex ? text.split(this.added_tokens_regex).filter(x => x) : [text]; - - const tokens = sections.map((x, section_index) => { - const addedToken = this.added_tokens.find(t => t.content === x); - if (addedToken !== undefined) { - // Ignore added tokens - return x - } else { - if (this.remove_space === true) { - x = x.trim().split(/\s+/).join(' '); - } - if (this.do_lowercase_and_remove_accent) { - x = lowercase_and_remove_accent(x); - } - - if (this.normalizer !== null) { - x = this.normalizer(x); - } - - const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, { - section_index, - }) : [x]; - - const tokens = this.model(sectionTokens); - - return tokens; - } - }).flat(); - - return tokens; - } - - /** - * Encodes a single text or a pair of texts using the model's tokenizer. - * - * @param {string} text The text to encode. 
- * @param {string|null} text_pair The optional second text to encode. - * @param {Object} options An optional object containing the following properties: - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @returns {EncodingSingle} An object containing the encoded text. - * @private - */ - _encode_plus(text, text_pair = null, { - add_special_tokens = true, - } = {}) { - // Function called by users to encode possibly multiple texts - const tokens = this._encode_text(text); - const tokens2 = this._encode_text(text_pair); - - const combinedTokens = this.post_processor - ? this.post_processor(tokens, tokens2, { add_special_tokens }) - : { tokens: mergeArrays(tokens ?? [], tokens2 ?? []) }; - - const input_ids = this.model.convert_tokens_to_ids(combinedTokens.tokens); - - const result = { - input_ids, - attention_mask: new Array(input_ids.length).fill(1), - } - if (this.return_token_type_ids && combinedTokens.token_type_ids) { - result.token_type_ids = combinedTokens.token_type_ids; - } - return result; - } - - /** - * Encodes a single text or a pair of texts using the model's tokenizer. - * - * @param {string} text The text to encode. - * @param {string|null} text_pair The optional second text to encode. - * @param {Object} options An optional object containing the following properties: - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @returns {number[]} An array of token IDs representing the encoded text(s). - */ - encode(text, text_pair = null, { - add_special_tokens = true, - } = {}) { - const { input_ids } = this._encode_plus(text, text_pair, { - add_special_tokens, - }); - return input_ids; - } - - /** - * Decode a batch of tokenized sequences. - * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences. 
- * @param {Object} decode_args (Optional) Object with decoding arguments. - * @returns {string[]} List of decoded sequences. - */ - batch_decode(batch, decode_args = {}) { - if (batch instanceof Tensor) { - batch = batch.tolist(); - } - return batch.map(x => this.decode(x, decode_args)); - } - - /** - * Decodes a sequence of token IDs back to a string. - * - * @param {number[]|Tensor} token_ids List/Tensor of token IDs to decode. - * @param {Object} [decode_args={}] - * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string. - * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed. - * - * @returns {string} The decoded string. - * @throws {Error} If `token_ids` is not a non-empty array of integers. - */ - decode( - token_ids, - decode_args = {}, + if ( + !Array.isArray(token_ids) || + token_ids.length === 0 || + !isIntegralNumber(token_ids[0]) ) { - if (token_ids instanceof Tensor) { - token_ids = prepareTensorForDecode(token_ids); - } - - if (!Array.isArray(token_ids) || token_ids.length === 0 || !isIntegralNumber(token_ids[0])) { - throw Error("token_ids must be a non-empty array of integers."); - } - - return this.decode_single(token_ids, decode_args) + throw Error("token_ids must be a non-empty array of integers."); } - /** - * Decode a single list of token ids to a string. - * @param {number[]} token_ids List of token ids to decode - * @param {Object} decode_args Optional arguments for decoding - * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding - * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding. - * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`. 
- * @returns {string} The decoded string - */ - decode_single( - token_ids, - { - skip_special_tokens = false, - clean_up_tokenization_spaces = null, - } - ) { - let tokens = this.model.convert_ids_to_tokens(token_ids); - if (skip_special_tokens) { - tokens = tokens.filter(x => !this.special_tokens.includes(x)); - } + return this.decode_single(token_ids, decode_args); + } - // If `this.decoder` is null, we just join tokens with a space: - // https://github.com/huggingface/tokenizers/blob/8edec536a737cb04494b454805be16c020abb14f/tokenizers/src/tokenizer/mod.rs#L835 - /** @type {string} */ - let decoded = this.decoder ? this.decoder(tokens) : tokens.join(' '); - - // Slight hack, but prevents having to pass `skip_special_tokens` to - // each call to `decode`, which would lead to code duplication. - if (this.decoder && this.decoder.end_of_word_suffix) { - decoded = decoded.replaceAll(this.decoder.end_of_word_suffix, ' '); - if (skip_special_tokens) { - decoded = decoded.trim(); - } - } - - if (clean_up_tokenization_spaces ?? this.clean_up_tokenization_spaces) { - decoded = clean_up_tokenization(decoded); - } - - return decoded; + /** + * Decode a single list of token ids to a string. + * @param {number[]} token_ids List of token ids to decode + * @param {Object} decode_args Optional arguments for decoding + * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding + * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding. + * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`. 
+ * @returns {string} The decoded string + */ + decode_single( + token_ids, + { skip_special_tokens = false, clean_up_tokenization_spaces = null }, + ) { + let tokens = this.model.convert_ids_to_tokens(token_ids); + if (skip_special_tokens) { + tokens = tokens.filter((x) => !this.special_tokens.includes(x)); } - get default_chat_template() { - if (!this._warned_about_chat_template) { - console.warn( - "No chat template is defined for this tokenizer - using a default chat template " + - "that implements the ChatML format. If the default is not appropriate for " + - "your model, please set `tokenizer.chat_template` to an appropriate template. " + - "See https://huggingface.co/docs/transformers/main/chat_templating for more information." - ) - this._warned_about_chat_template = true; // TODO move to logger.warning_once() - } + // If `this.decoder` is null, we just join tokens with a space: + // https://github.com/huggingface/tokenizers/blob/8edec536a737cb04494b454805be16c020abb14f/tokenizers/src/tokenizer/mod.rs#L835 + /** @type {string} */ + let decoded = this.decoder ? this.decoder(tokens) : tokens.join(" "); - return this._default_chat_template; + // Slight hack, but prevents having to pass `skip_special_tokens` to + // each call to `decode`, which would lead to code duplication. + if (this.decoder && this.decoder.end_of_word_suffix) { + decoded = decoded.replaceAll(this.decoder.end_of_word_suffix, " "); + if (skip_special_tokens) { + decoded = decoded.trim(); + } } - /** - * @typedef {Object} Message - * @property {string} role The role of the message (e.g., "user" or "assistant" or "system"). - * @property {string} content The content of the message. - */ - - /** - * Converts a list of message objects with `"role"` and `"content"` keys to a list of token - * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to - * determine the format and control tokens to use when converting. 
When chat_template is None, it will fall back - * to the default_chat_template specified at the class level. - * - * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information. - * - * **Example:** Applying a chat template to a conversation. - * - * ```javascript - * import { AutoTokenizer } from "@xenova/transformers"; - * - * const tokenizer = await AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1"); - * - * const chat = [ - * { "role": "user", "content": "Hello, how are you?" }, - * { "role": "assistant", "content": "I'm doing great. How can I help you today?" }, - * { "role": "user", "content": "I'd like to show off how chat templating works!" }, - * ] - * - * const text = tokenizer.apply_chat_template(chat, { tokenize: false }); - * // "[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" - * - * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false }); - * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793] - * ``` - * - * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys. - * @param {Object} options An optional object containing the following properties: - * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If - * this is not passed, the model's default chat template will be used instead. - * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate - * the start of an assistant message. This is useful when you want to generate a response from the model. 
- * Note that this argument will be passed to the chat template, and so it must be supported in the - * template for this argument to have any effect. - * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string. - * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false. - * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false. - * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false. - * If not specified, the tokenizer's `max_length` attribute will be used as a default. - * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false. - * @returns {string | Tensor | number[]| number[][]} The tokenized output. - */ - apply_chat_template(conversation, { - chat_template = null, - add_generation_prompt = false, - tokenize = true, - padding = false, - truncation = false, - max_length = null, - return_tensor = true, - } = {}) { - - chat_template ??= this.chat_template ?? 
this.default_chat_template; - - // Compilation function uses a cache to avoid recompiling the same template - let compiledTemplate = this._compiled_template_cache.get(chat_template); - if (compiledTemplate === undefined) { - compiledTemplate = new Template(chat_template); - this._compiled_template_cache.set(chat_template, compiledTemplate); - } - - const special_tokens_map = Object.create(null); - for (const key of SPECIAL_TOKEN_ATTRIBUTES) { - const value = this.getToken(key); - if (value) { - special_tokens_map[key] = value; - } - } - - const rendered = compiledTemplate.render({ - messages: conversation, - add_generation_prompt: add_generation_prompt, - - ...special_tokens_map, - }); - - if (tokenize) { - return this._call(rendered, { - add_special_tokens: false, - padding, - truncation, - max_length, - return_tensor, - }).input_ids; - } - - return rendered; + if (clean_up_tokenization_spaces ?? this.clean_up_tokenization_spaces) { + decoded = clean_up_tokenization(decoded); } + + return decoded; + } + + get default_chat_template() { + if (!this._warned_about_chat_template) { + console.warn( + "No chat template is defined for this tokenizer - using a default chat template " + + "that implements the ChatML format. If the default is not appropriate for " + + "your model, please set `tokenizer.chat_template` to an appropriate template. " + + "See https://huggingface.co/docs/transformers/main/chat_templating for more information.", + ); + this._warned_about_chat_template = true; // TODO move to logger.warning_once() + } + + return this._default_chat_template; + } + + /** + * @typedef {Object} Message + * @property {string} role The role of the message (e.g., "user" or "assistant" or "system"). + * @property {string} content The content of the message. + */ + + /** + * Converts a list of message objects with `"role"` and `"content"` keys to a list of token + * ids. 
This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to + * determine the format and control tokens to use when converting. When chat_template is None, it will fall back + * to the default_chat_template specified at the class level. + * + * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information. + * + * **Example:** Applying a chat template to a conversation. + * + * ```javascript + * import { AutoTokenizer } from "@xenova/transformers"; + * + * const tokenizer = await AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1"); + * + * const chat = [ + * { "role": "user", "content": "Hello, how are you?" }, + * { "role": "assistant", "content": "I'm doing great. How can I help you today?" }, + * { "role": "user", "content": "I'd like to show off how chat templating works!" }, + * ] + * + * const text = tokenizer.apply_chat_template(chat, { tokenize: false }); + * // "[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" + * + * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false }); + * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793] + * ``` + * + * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys. + * @param {Object} options An optional object containing the following properties: + * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If + * this is not passed, the model's default chat template will be used instead. 
+ * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate + * the start of an assistant message. This is useful when you want to generate a response from the model. + * Note that this argument will be passed to the chat template, and so it must be supported in the + * template for this argument to have any effect. + * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string. + * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false. + * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false. + * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false. + * If not specified, the tokenizer's `max_length` attribute will be used as a default. + * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false. + * @returns {string | Tensor | number[]| number[][]} The tokenized output. + */ + apply_chat_template( + conversation, + { + chat_template = null, + add_generation_prompt = false, + tokenize = true, + padding = false, + truncation = false, + max_length = null, + return_tensor = true, + } = {}, + ) { + chat_template ??= this.chat_template ?? 
this.default_chat_template; + + // Compilation function uses a cache to avoid recompiling the same template + let compiledTemplate = this._compiled_template_cache.get(chat_template); + if (compiledTemplate === undefined) { + compiledTemplate = new Template(chat_template); + this._compiled_template_cache.set(chat_template, compiledTemplate); + } + + const special_tokens_map = Object.create(null); + for (const key of SPECIAL_TOKEN_ATTRIBUTES) { + const value = this.getToken(key); + if (value) { + special_tokens_map[key] = value; + } + } + + const rendered = compiledTemplate.render({ + messages: conversation, + add_generation_prompt: add_generation_prompt, + + ...special_tokens_map, + }); + + if (tokenize) { + return this._call(rendered, { + add_special_tokens: false, + padding, + truncation, + max_length, + return_tensor, + }).input_ids; + } + + return rendered; + } } /** @@ -3006,157 +3064,187 @@ export class PreTrainedTokenizer extends Callable { * @extends PreTrainedTokenizer */ export class BertTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } /** * Albert tokenizer * @extends PreTrainedTokenizer */ export class AlbertTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class MobileBertTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class SqueezeBertTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class DebertaTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class DebertaV2Tokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class HerbertTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } export class ConvBertTokenizer extends PreTrainedTokenizer { 
- return_token_type_ids = true; + return_token_type_ids = true; } export class RoFormerTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } -export class DistilBertTokenizer extends PreTrainedTokenizer { } -export class CamembertTokenizer extends PreTrainedTokenizer { } +export class DistilBertTokenizer extends PreTrainedTokenizer {} +export class CamembertTokenizer extends PreTrainedTokenizer {} export class XLMTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); - console.warn('WARNING: `XLMTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.') - } + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); + console.warn( + 'WARNING: `XLMTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. 
Therefore, you may experience slightly inaccurate results.', + ); + } } export class ElectraTokenizer extends PreTrainedTokenizer { - return_token_type_ids = true; + return_token_type_ids = true; } -export class T5Tokenizer extends PreTrainedTokenizer { } +export class T5Tokenizer extends PreTrainedTokenizer {} export class GPT2Tokenizer extends PreTrainedTokenizer { - _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}` + _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`; } -export class BartTokenizer extends PreTrainedTokenizer { } +export class BartTokenizer extends PreTrainedTokenizer {} export class MBartTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); - this.languageRegex = /^[a-z]{2}_[A-Z]{2}$/; - this.language_codes = this.special_tokens.filter(x => this.languageRegex.test(x)); - this.lang_to_token = x => x; // Identity function - } + this.languageRegex = /^[a-z]{2}_[A-Z]{2}$/; + this.language_codes = this.special_tokens.filter((x) => + this.languageRegex.test(x), + ); + this.lang_to_token = (x) => x; // Identity function + } - /** - * Helper function to build translation inputs for an `MBartTokenizer`. - * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. - */ - _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { - return _build_translation_inputs(this, raw_inputs, tokenizer_options, generate_kwargs); - } + /** + * Helper function to build translation inputs for an `MBartTokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. 
+ * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. + */ + _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { + return _build_translation_inputs( + this, + raw_inputs, + tokenizer_options, + generate_kwargs, + ); + } } -export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartTokenizer +export class MBart50Tokenizer extends MBartTokenizer {} // NOTE: extends MBartTokenizer -export class RobertaTokenizer extends PreTrainedTokenizer { } +export class RobertaTokenizer extends PreTrainedTokenizer {} -export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to get the correct chat template +export class BloomTokenizer extends GPT2Tokenizer { + // NOTE: `GPT2Tokenizer` to get the correct chat template - constructor(tokenizerJSON, tokenizerConfig) { - // Override the default (invalid) regex of the pretokenizer. - // For more information, see https://github.com/xenova/transformers.js/issues/94 - const splitChars = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c'; - const patternObject = tokenizerJSON.pre_tokenizer?.pretokenizers[0]?.pattern; - if (patternObject && patternObject.Regex === ` ?[^(\\s|[${splitChars}])]+`) { - patternObject.Regex = ` ?[^\\s${splitChars}]+`; - } - super(tokenizerJSON, tokenizerConfig); + constructor(tokenizerJSON, tokenizerConfig) { + // Override the default (invalid) regex of the pretokenizer. 
+ // For more information, see https://github.com/xenova/transformers.js/issues/94 + const splitChars = ".,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c"; + const patternObject = + tokenizerJSON.pre_tokenizer?.pretokenizers[0]?.pattern; + if ( + patternObject && + patternObject.Regex === ` ?[^(\\s|[${splitChars}])]+` + ) { + patternObject.Regex = ` ?[^\\s${splitChars}]+`; } + super(tokenizerJSON, tokenizerConfig); + } } const SPIECE_UNDERLINE = "โ–"; export class LlamaTokenizer extends PreTrainedTokenizer { - _default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}` + _default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for 
message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`; - DEFAULT_SYSTEM_PROMPT = - "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " + - "answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " + - "that your responses are socially unbiased and positive in nature.\n\n" + - "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " + - "correct. If you don't know the answer to a question, please don't share false information." + DEFAULT_SYSTEM_PROMPT = + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " + + "answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " + + "that your responses are socially unbiased and positive in nature.\n\n" + + "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " + + "correct. If you don't know the answer to a question, please don't share false information."; - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); - this.use_default_system_prompt = tokenizerConfig.use_default_system_prompt ?? 
false; + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); + this.use_default_system_prompt = + tokenizerConfig.use_default_system_prompt ?? false; - this.legacy = tokenizerConfig.legacy ?? true; - if (!this.legacy) { - // See https://github.com/huggingface/transformers/pull/24565 for more information - this.normalizer = null; - this.pre_tokenizer = new MetaspacePreTokenizer({ - replacement: SPIECE_UNDERLINE, - add_prefix_space: true, - prepend_scheme: "first", - }); - } + this.legacy = tokenizerConfig.legacy ?? true; + if (!this.legacy) { + // See https://github.com/huggingface/transformers/pull/24565 for more information + this.normalizer = null; + this.pre_tokenizer = new MetaspacePreTokenizer({ + replacement: SPIECE_UNDERLINE, + add_prefix_space: true, + prepend_scheme: "first", + }); + } + } + + /** + * Helper function to handle legacy encoding of SPM tokenizers. + * Adapted from https://github.com/huggingface/transformers/blob/e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2/src/transformers/models/t5/tokenization_t5.py#L374-L387 + * @param {string} text The text to encode. + * @returns {string[]} The encoded tokens. + */ + _encode_text(text) { + if (text === null) return null; + + if (this.legacy || text.length === 0) { + return super._encode_text(text); } - /** - * Helper function to handle legacy encoding of SPM tokenizers. - * Adapted from https://github.com/huggingface/transformers/blob/e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2/src/transformers/models/t5/tokenization_t5.py#L374-L387 - * @param {string} text The text to encode. - * @returns {string[]} The encoded tokens. 
- */ - _encode_text(text) { - if (text === null) return null; - - if (this.legacy || text.length === 0) { - return super._encode_text(text); - } - - let tokens = super._encode_text(SPIECE_UNDERLINE + text.replaceAll(SPIECE_UNDERLINE, " ")); - if (tokens.length > 1 && tokens[0] === SPIECE_UNDERLINE && this.special_tokens.includes(tokens[1])) { - tokens = tokens.slice(1); - } - return tokens; + let tokens = super._encode_text( + SPIECE_UNDERLINE + text.replaceAll(SPIECE_UNDERLINE, " "), + ); + if ( + tokens.length > 1 && + tokens[0] === SPIECE_UNDERLINE && + this.special_tokens.includes(tokens[1]) + ) { + tokens = tokens.slice(1); } + return tokens; + } - get default_chat_template() { - return super.default_chat_template - .replaceAll('USE_DEFAULT_PROMPT', this.use_default_system_prompt ? 'true' : 'false') - .replaceAll('DEFAULT_SYSTEM_MESSAGE', this.DEFAULT_SYSTEM_PROMPT.replaceAll("\n", "\\n").replaceAll("'", "\\'")); - } + get default_chat_template() { + return super.default_chat_template + .replaceAll( + "USE_DEFAULT_PROMPT", + this.use_default_system_prompt ? 
"true" : "false", + ) + .replaceAll( + "DEFAULT_SYSTEM_MESSAGE", + this.DEFAULT_SYSTEM_PROMPT.replaceAll("\n", "\\n").replaceAll( + "'", + "\\'", + ), + ); + } } -export class CodeLlamaTokenizer extends LlamaTokenizer { } // NOTE: `LlamaTokenizer` to get the correct chat template +export class CodeLlamaTokenizer extends LlamaTokenizer {} // NOTE: `LlamaTokenizer` to get the correct chat template -export class XLMRobertaTokenizer extends PreTrainedTokenizer { } -export class MPNetTokenizer extends PreTrainedTokenizer { } +export class XLMRobertaTokenizer extends PreTrainedTokenizer {} +export class MPNetTokenizer extends PreTrainedTokenizer {} -export class FalconTokenizer extends PreTrainedTokenizer { } +export class FalconTokenizer extends PreTrainedTokenizer {} -export class GPTNeoXTokenizer extends PreTrainedTokenizer { } +export class GPTNeoXTokenizer extends PreTrainedTokenizer {} -export class EsmTokenizer extends PreTrainedTokenizer { } +export class EsmTokenizer extends PreTrainedTokenizer {} /** * Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`. @@ -3167,237 +3255,267 @@ export class EsmTokenizer extends PreTrainedTokenizer { } * @returns {Object} Object to be passed to the model. 
* @private */ -function _build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs) { - if (!('language_codes' in self) || !Array.isArray(self.language_codes)) { - throw new Error('Tokenizer must have `language_codes` attribute set and it should be an array of language ids.') - } - if (!('languageRegex' in self) || !(self.languageRegex instanceof RegExp)) { - throw new Error('Tokenizer must have `languageRegex` attribute set and it should be a regular expression.') - } - if (!('lang_to_token' in self) || typeof self.lang_to_token !== 'function') { - throw new Error('Tokenizer must have `lang_to_token` attribute set and it should be a function.') - } - const src_lang_token = generate_kwargs.src_lang; - const tgt_lang_token = generate_kwargs.tgt_lang; +function _build_translation_inputs( + self, + raw_inputs, + tokenizer_options, + generate_kwargs, +) { + if (!("language_codes" in self) || !Array.isArray(self.language_codes)) { + throw new Error( + "Tokenizer must have `language_codes` attribute set and it should be an array of language ids.", + ); + } + if (!("languageRegex" in self) || !(self.languageRegex instanceof RegExp)) { + throw new Error( + "Tokenizer must have `languageRegex` attribute set and it should be a regular expression.", + ); + } + if (!("lang_to_token" in self) || typeof self.lang_to_token !== "function") { + throw new Error( + "Tokenizer must have `lang_to_token` attribute set and it should be a function.", + ); + } + const src_lang_token = generate_kwargs.src_lang; + const tgt_lang_token = generate_kwargs.tgt_lang; - // Check that the target language is valid: - if (!self.language_codes.includes(tgt_lang_token)) { - throw new Error(`Target language code "${tgt_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`); + // Check that the target language is valid: + if (!self.language_codes.includes(tgt_lang_token)) { + throw new Error( + `Target language code "${tgt_lang_token}" is not valid. 
Must be one of: {${self.language_codes.join(", ")}}`, + ); + } + + // Allow `src_lang` to be optional. If not set, we'll use the tokenizer's default. + if (src_lang_token !== undefined) { + // Check that the source language is valid: + if (!self.language_codes.includes(src_lang_token)) { + throw new Error( + `Source language code "${src_lang_token}" is not valid. Must be one of: {${self.language_codes.join(", ")}}`, + ); } - // Allow `src_lang` to be optional. If not set, we'll use the tokenizer's default. - if (src_lang_token !== undefined) { - // Check that the source language is valid: - if (!self.language_codes.includes(src_lang_token)) { - throw new Error(`Source language code "${src_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`); - } - - // In the same way as the Python library, we override the post-processor - // to force the source language to be first: - for (const item of self.post_processor.config.single) { - if ('SpecialToken' in item && self.languageRegex.test(item.SpecialToken.id)) { - item.SpecialToken.id = self.lang_to_token(src_lang_token); - break; - } - } - // TODO: Do the same for pair? + // In the same way as the Python library, we override the post-processor + // to force the source language to be first: + for (const item of self.post_processor.config.single) { + if ( + "SpecialToken" in item && + self.languageRegex.test(item.SpecialToken.id) + ) { + item.SpecialToken.id = self.lang_to_token(src_lang_token); + break; + } } + // TODO: Do the same for pair? 
+ } - // Override the `forced_bos_token_id` to force the correct language - generate_kwargs.forced_bos_token_id = self.model.convert_tokens_to_ids([self.lang_to_token(tgt_lang_token)])[0]; + // Override the `forced_bos_token_id` to force the correct language + generate_kwargs.forced_bos_token_id = self.model.convert_tokens_to_ids([ + self.lang_to_token(tgt_lang_token), + ])[0]; - return self._call(raw_inputs, tokenizer_options); + return self._call(raw_inputs, tokenizer_options); } /** * The NllbTokenizer class is used to tokenize text for NLLB ("No Language Left Behind") models. - * + * * No Language Left Behind (NLLB) is a first-of-its-kind, AI breakthrough project * that open-sources models capable of delivering high-quality translations directly * between any pair of 200+ languages โ€” including low-resource languages like Asturian, * Luganda, Urdu and more. It aims to help people communicate with anyone, anywhere, * regardless of their language preferences. For more information, check out their * [paper](https://arxiv.org/abs/2207.04672). - * + * * For a list of supported languages (along with their language codes), * @see {@link https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200} */ export class NllbTokenizer extends PreTrainedTokenizer { + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); + this.languageRegex = /^[a-z]{3}_[A-Z][a-z]{3}$/; + this.language_codes = this.special_tokens.filter((x) => + this.languageRegex.test(x), + ); + this.lang_to_token = (x) => x; // Identity function + } - this.languageRegex = /^[a-z]{3}_[A-Z][a-z]{3}$/; - this.language_codes = this.special_tokens.filter(x => this.languageRegex.test(x)); - this.lang_to_token = x => x; // Identity function - } - - /** - * Helper function to build translation inputs for an `NllbTokenizer`. 
- * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. - */ - _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { - return _build_translation_inputs(this, raw_inputs, tokenizer_options, generate_kwargs); - } + /** + * Helper function to build translation inputs for an `NllbTokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. + * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. + */ + _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { + return _build_translation_inputs( + this, + raw_inputs, + tokenizer_options, + generate_kwargs, + ); + } } /** * The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models. - * + * * M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many * multilingual translation. It was introduced in this [paper](https://arxiv.org/abs/2010.11125) * and first released in [this](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) repository. 
- * + * * For a list of supported languages (along with their language codes), * @see {@link https://huggingface.co/facebook/m2m100_418M#languages-covered} */ export class M2M100Tokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); - this.languageRegex = /^__[a-z]{2,3}__$/; - this.language_codes = this.special_tokens - .filter(x => this.languageRegex.test(x)) - .map(x => x.slice(2, -2)); - this.lang_to_token = x => `__${x}__`; - } + this.languageRegex = /^__[a-z]{2,3}__$/; + this.language_codes = this.special_tokens + .filter((x) => this.languageRegex.test(x)) + .map((x) => x.slice(2, -2)); + this.lang_to_token = (x) => `__${x}__`; + } - /** - * Helper function to build translation inputs for an `M2M100Tokenizer`. - * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. - */ - _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { - return _build_translation_inputs(this, raw_inputs, tokenizer_options, generate_kwargs); - } + /** + * Helper function to build translation inputs for an `M2M100Tokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. + * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. 
+ */ + _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) { + return _build_translation_inputs( + this, + raw_inputs, + tokenizer_options, + generate_kwargs, + ); + } } - const WHISPER_LANGUAGES = [ - ["en", "english"], - ["zh", "chinese"], - ["de", "german"], - ["es", "spanish"], - ["ru", "russian"], - ["ko", "korean"], - ["fr", "french"], - ["ja", "japanese"], - ["pt", "portuguese"], - ["tr", "turkish"], - ["pl", "polish"], - ["ca", "catalan"], - ["nl", "dutch"], - ["ar", "arabic"], - ["sv", "swedish"], - ["it", "italian"], - ["id", "indonesian"], - ["hi", "hindi"], - ["fi", "finnish"], - ["vi", "vietnamese"], - ["he", "hebrew"], - ["uk", "ukrainian"], - ["el", "greek"], - ["ms", "malay"], - ["cs", "czech"], - ["ro", "romanian"], - ["da", "danish"], - ["hu", "hungarian"], - ["ta", "tamil"], - ["no", "norwegian"], - ["th", "thai"], - ["ur", "urdu"], - ["hr", "croatian"], - ["bg", "bulgarian"], - ["lt", "lithuanian"], - ["la", "latin"], - ["mi", "maori"], - ["ml", "malayalam"], - ["cy", "welsh"], - ["sk", "slovak"], - ["te", "telugu"], - ["fa", "persian"], - ["lv", "latvian"], - ["bn", "bengali"], - ["sr", "serbian"], - ["az", "azerbaijani"], - ["sl", "slovenian"], - ["kn", "kannada"], - ["et", "estonian"], - ["mk", "macedonian"], - ["br", "breton"], - ["eu", "basque"], - ["is", "icelandic"], - ["hy", "armenian"], - ["ne", "nepali"], - ["mn", "mongolian"], - ["bs", "bosnian"], - ["kk", "kazakh"], - ["sq", "albanian"], - ["sw", "swahili"], - ["gl", "galician"], - ["mr", "marathi"], - ["pa", "punjabi"], - ["si", "sinhala"], - ["km", "khmer"], - ["sn", "shona"], - ["yo", "yoruba"], - ["so", "somali"], - ["af", "afrikaans"], - ["oc", "occitan"], - ["ka", "georgian"], - ["be", "belarusian"], - ["tg", "tajik"], - ["sd", "sindhi"], - ["gu", "gujarati"], - ["am", "amharic"], - ["yi", "yiddish"], - ["lo", "lao"], - ["uz", "uzbek"], - ["fo", "faroese"], - ["ht", "haitian creole"], - ["ps", "pashto"], - ["tk", "turkmen"], - ["nn", "nynorsk"], - ["mt", 
"maltese"], - ["sa", "sanskrit"], - ["lb", "luxembourgish"], - ["my", "myanmar"], - ["bo", "tibetan"], - ["tl", "tagalog"], - ["mg", "malagasy"], - ["as", "assamese"], - ["tt", "tatar"], - ["haw", "hawaiian"], - ["ln", "lingala"], - ["ha", "hausa"], - ["ba", "bashkir"], - ["jw", "javanese"], - ["su", "sundanese"], -] + ["en", "english"], + ["zh", "chinese"], + ["de", "german"], + ["es", "spanish"], + ["ru", "russian"], + ["ko", "korean"], + ["fr", "french"], + ["ja", "japanese"], + ["pt", "portuguese"], + ["tr", "turkish"], + ["pl", "polish"], + ["ca", "catalan"], + ["nl", "dutch"], + ["ar", "arabic"], + ["sv", "swedish"], + ["it", "italian"], + ["id", "indonesian"], + ["hi", "hindi"], + ["fi", "finnish"], + ["vi", "vietnamese"], + ["he", "hebrew"], + ["uk", "ukrainian"], + ["el", "greek"], + ["ms", "malay"], + ["cs", "czech"], + ["ro", "romanian"], + ["da", "danish"], + ["hu", "hungarian"], + ["ta", "tamil"], + ["no", "norwegian"], + ["th", "thai"], + ["ur", "urdu"], + ["hr", "croatian"], + ["bg", "bulgarian"], + ["lt", "lithuanian"], + ["la", "latin"], + ["mi", "maori"], + ["ml", "malayalam"], + ["cy", "welsh"], + ["sk", "slovak"], + ["te", "telugu"], + ["fa", "persian"], + ["lv", "latvian"], + ["bn", "bengali"], + ["sr", "serbian"], + ["az", "azerbaijani"], + ["sl", "slovenian"], + ["kn", "kannada"], + ["et", "estonian"], + ["mk", "macedonian"], + ["br", "breton"], + ["eu", "basque"], + ["is", "icelandic"], + ["hy", "armenian"], + ["ne", "nepali"], + ["mn", "mongolian"], + ["bs", "bosnian"], + ["kk", "kazakh"], + ["sq", "albanian"], + ["sw", "swahili"], + ["gl", "galician"], + ["mr", "marathi"], + ["pa", "punjabi"], + ["si", "sinhala"], + ["km", "khmer"], + ["sn", "shona"], + ["yo", "yoruba"], + ["so", "somali"], + ["af", "afrikaans"], + ["oc", "occitan"], + ["ka", "georgian"], + ["be", "belarusian"], + ["tg", "tajik"], + ["sd", "sindhi"], + ["gu", "gujarati"], + ["am", "amharic"], + ["yi", "yiddish"], + ["lo", "lao"], + ["uz", "uzbek"], + ["fo", "faroese"], + 
["ht", "haitian creole"], + ["ps", "pashto"], + ["tk", "turkmen"], + ["nn", "nynorsk"], + ["mt", "maltese"], + ["sa", "sanskrit"], + ["lb", "luxembourgish"], + ["my", "myanmar"], + ["bo", "tibetan"], + ["tl", "tagalog"], + ["mg", "malagasy"], + ["as", "assamese"], + ["tt", "tatar"], + ["haw", "hawaiian"], + ["ln", "lingala"], + ["ha", "hausa"], + ["ba", "bashkir"], + ["jw", "javanese"], + ["su", "sundanese"], +]; // @ts-ignore const WHISPER_LANGUAGE_MAPPING = new Map(WHISPER_LANGUAGES); // @ts-ignore const WHISPER_TO_LANGUAGE_CODE_MAPPING = new Map([ - ...WHISPER_LANGUAGES.map(([k, v]) => [v, k]), - ...[ - ["burmese", "my"], - ["valencian", "ca"], - ["flemish", "nl"], - ["haitian", "ht"], - ["letzeburgesch", "lb"], - ["pushto", "ps"], - ["panjabi", "pa"], - ["moldavian", "ro"], - ["moldovan", "ro"], - ["sinhalese", "si"], - ["castilian", "es"], - ] + ...WHISPER_LANGUAGES.map(([k, v]) => [v, k]), + ...[ + ["burmese", "my"], + ["valencian", "ca"], + ["flemish", "nl"], + ["haitian", "ht"], + ["letzeburgesch", "lb"], + ["pushto", "ps"], + ["panjabi", "pa"], + ["moldavian", "ro"], + ["moldovan", "ro"], + ["sinhalese", "si"], + ["castilian", "es"], + ], ]); /** @@ -3405,912 +3523,977 @@ const WHISPER_TO_LANGUAGE_CODE_MAPPING = new Map([ * @extends PreTrainedTokenizer */ export class WhisperTokenizer extends PreTrainedTokenizer { - _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`; + _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`; - /** - * Decodes automatic speech recognition (ASR) sequences. - * @param {Array<{tokens: number[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode. - * @param {Object} options The options to use for decoding. - * @returns {Array, text: string}>}>} The decoded sequences. 
- */ - _decode_asr(sequences, { - return_timestamps = false, - return_language = false, - time_precision = null, - force_full_sequences = true - } = {}) { - // Set force_full_sequences=false if you want streaming - // TODO add support for `return_language` + /** + * Decodes automatic speech recognition (ASR) sequences. + * @param {Array<{tokens: number[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode. + * @param {Object} options The options to use for decoding. + * @returns {Array, text: string}>}>} The decoded sequences. + */ + _decode_asr( + sequences, + { + return_timestamps = false, + return_language = false, + time_precision = null, + force_full_sequences = true, + } = {}, + ) { + // Set force_full_sequences=false if you want streaming + // TODO add support for `return_language` - // Internal method meant to only be used by asr pipeline. - // Handles all the little quirks specific to whisper to handle - // the various options not allowed in other seq2seq models + // Internal method meant to only be used by asr pipeline. + // Handles all the little quirks specific to whisper to handle + // the various options not allowed in other seq2seq models - // =========== Overview ============ - // - iterate over all outputs - // - all tokens within output - // - Each token can be - // - language token - // - special token - // - timestamp token - // - text token - // - We accumulate the text tokens. - // - We split on end timestamps - // - Lots of complexity comes from stride and timestamps + // =========== Overview ============ + // - iterate over all outputs + // - all tokens within output + // - Each token can be + // - language token + // - special token + // - timestamp token + // - text token + // - We accumulate the text tokens. 
+ // - We split on end timestamps + // - Lots of complexity comes from stride and timestamps - if (time_precision === null) { - throw Error("Must specify time_precision") - } - let last_language = null; + if (time_precision === null) { + throw Error("Must specify time_precision"); + } + let last_language = null; - const returnWordTimestamps = return_timestamps === "word"; - - function new_chunk() { - return { "language": last_language, "timestamp": [null, null], "text": "" }; - } - - // Welcome to the state machine! - const chunks = []; - let chunk = new_chunk(); - let time_offset = 0.0; - const timestamp_begin = this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; - - let previous_tokens = []; - let previous_token_timestamps = []; - - let skip = false; - let right_stride_start = null; - - - const all_special_ids = new Set(this.all_special_ids); - - for (const output of sequences) { - // NOTE: python version has batches, so it uses [0] - const token_ids = output.tokens; - const token_timestamps = returnWordTimestamps ? output.token_timestamps : null; - - // These keep track of timestamps within strides, which need - // to be skipped and resolve all tokens in a single chunk. - let last_timestamp = null; - let first_timestamp = timestamp_begin; - - if ("stride" in output) { - const [chunk_len, stride_left, stride_right] = output.stride; - - // Offset the timings to account for the other `model_outputs`. - time_offset -= stride_left; - right_stride_start = chunk_len - stride_right; - - // Keeping track of timestamps within strides - // We're going to NOT split on those, and delay until we're - // out of BOTH stride. 
Otherwise lots of issues occur and - // corner cases - if (stride_left) { - first_timestamp = stride_left / time_precision + timestamp_begin; - } - - if (stride_right) { - for (let i = token_ids.length - 1; i >= 0; --i) { - const token = token_ids[i]; - if (token >= timestamp_begin) { - // There can be several token in the right stride - // But the last one is ALWAYS going to be skipped - if (last_timestamp !== null && (token - timestamp_begin) * time_precision < right_stride_start) { - break; - } - last_timestamp = token; - } - } - } - } - - let current_tokens = []; - let current_token_timestamps = []; - - // - all tokens within output - for (let i = 0; i < token_ids.length; ++i) { - const token = token_ids[i]; - // 4 possible states for each token - // - 1/ Language code - // - 2/ all other special tokens (which we ignore) - // - 3/ Timestamp - // - 4/ Regular text - - if (all_special_ids.has(token)) { - const text = this.decode([token]); - const language = WHISPER_LANGUAGE_MAPPING.get(text.slice(2, -2)); - - if (language !== undefined) { - // 1/ Indeed some language - // TODO Handle when language is different from the previous - // one, and we cannot use timestamped tokens to create chunks - if (last_language !== null && language !== last_language && !return_timestamps) { - previous_tokens.push(current_tokens); - const resolved_tokens = this.findLongestCommonSequence(previous_tokens)[0]; - const resolved_text = this.decode(resolved_tokens); - chunk.text = resolved_text; - chunks.push(chunk); - - // Flush all our temporary context - previous_tokens = []; - current_tokens = []; - chunk = new_chunk(); - } - - last_language = chunk.language = language; - } else { - // 2/ This is a regular special token, ignoring it - } - } else if (token >= timestamp_begin) { - // 3/ Timestamp token - const time = (token - timestamp_begin) * time_precision + time_offset; - const rounded_time = round(time, 2); - - if (last_timestamp !== null && token >= last_timestamp) { - // Whisper 
outputted a timestamp token, but it falls within - // our stride, so we're going to skip it for the time being - // and resolve this later - // Skip is necessary because timestamp tokens always come - // by pair, so we need to skip the next one too (which would mark the start of another chunk). - skip = true; - } else if (skip || (previous_tokens.length > 0 && token < first_timestamp)) { - skip = false; - } else if (chunk.timestamp[0] === null) { - chunk.timestamp[0] = rounded_time; - } else { - // This is the end of the timestamp chunk - if (rounded_time === chunk.timestamp[0]) { - // This is a bug in timestamp token output - // where we're taking the duplicate token - // as a stop where it should be a start. - // This is an issue in the underlying model output - // Let's just skip it so it becomes de-factor a start agin - } else { - chunk.timestamp[1] = rounded_time; - - // Handling merges - previous_tokens.push(current_tokens) - - if (returnWordTimestamps) { - previous_token_timestamps.push(current_token_timestamps); - } - const [resolved_tokens, resolved_token_timestamps] = this.findLongestCommonSequence( - previous_tokens, previous_token_timestamps - ) - - const resolved_text = this.decode(resolved_tokens) - chunk.text = resolved_text - - if (returnWordTimestamps) { - chunk.words = this.collateWordTimestamps( - resolved_tokens, resolved_token_timestamps, last_language, - ) - } - - chunks.push(chunk) - - // Flush all our temporary context - previous_tokens = [] - current_tokens = [] - previous_token_timestamps = [] - current_token_timestamps = [] - chunk = new_chunk() - } - } - - } else { - // 4/ Regular token - // We just append to the list of all tokens so we can handle - // merges later and decode into text. 
- current_tokens.push(token) - - if (returnWordTimestamps) { - let start_time = round(token_timestamps[i] + time_offset, 2); - - let end_time; - if (i + 1 < token_timestamps.length) { - end_time = round(token_timestamps[i + 1] + time_offset, 2); - } else { - // should never happen - end_time = null; - } - current_token_timestamps.push([start_time, end_time]); - } - - } - } - - if ('stride' in output) { - const [chunk_len, stride_left, stride_right] = output.stride; - time_offset += chunk_len - stride_right - } - - // Leftover tokens - if (current_tokens.length > 0) { - previous_tokens.push(current_tokens) - if (returnWordTimestamps) { - previous_token_timestamps.push(current_token_timestamps); - } - } else if (previous_tokens.every(p => p.length === 0)) { - // Flushing previous tokens (END)" - chunk = new_chunk() - previous_tokens = [] - current_tokens = [] - previous_token_timestamps = []; - current_token_timestamps = []; - } - - } - - if (previous_tokens.length > 0) { - if (force_full_sequences && return_timestamps) { - // Last token should always be timestamps, so there shouldn't be - // leftover - throw new Error( - "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. " + - "Also make sure WhisperTimeStampLogitsProcessor was used during generation." 
- ); - } - - // Happens when we don't use timestamps - const [resolved_tokens, resolved_token_timestamps] = this.findLongestCommonSequence(previous_tokens, previous_token_timestamps); - - // Flushing previous tokens (FINAL) - const resolved_text = this.decode(resolved_tokens); - chunk.text = resolved_text; - if (returnWordTimestamps) { - chunk.words = this.collateWordTimestamps( - resolved_tokens, resolved_token_timestamps, last_language, - ) - } - chunks.push(chunk); - } - - let optional = Object.create(null); - - // Preparing and cleaning up the pipeline output - const full_text = chunks.map(chunk => chunk.text).join(''); - if (return_timestamps || return_language) { - for (let i = 0; i < chunks.length; ++i) { - const chunk = chunks[i]; - if (!return_timestamps) { - delete chunk["timestamp"]; - } - - if (!return_language) { - delete chunk["language"]; - } - } - if (returnWordTimestamps) { - const new_chunks = []; - for (const chunk of chunks) { - for (const word of chunk.words) { - new_chunks.push(word); - } - } - optional = { "chunks": new_chunks }; - } else { - optional = { "chunks": chunks }; - } - } - return [full_text, optional]; + const returnWordTimestamps = return_timestamps === "word"; + function new_chunk() { + return { language: last_language, timestamp: [null, null], text: "" }; } - /** - * Finds the longest common sequence among the provided sequences. - * @param {number[][]} sequences An array of sequences of token ids to compare. - * @returns {number[][]} The longest common sequence found. - * @throws {Error} If there is a bug within the function. - * @private - */ - findLongestCommonSequence(sequences, token_timestamp_sequences = null) { - // It would be much harder to do O(n) because of fault tolerance. - // We actually have a really good property which is that the total sequence - // MUST be those subsequences in order. - // If token_timestamp_sequences is provided, will split those sequences in - // exactly the same way. 
- let leftSequence = sequences[0]; - let leftLength = leftSequence.length; - let totalSequence = []; + // Welcome to the state machine! + const chunks = []; + let chunk = new_chunk(); + let time_offset = 0.0; + const timestamp_begin = + this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; - const use_token_timestamp_sequences = Array.isArray(token_timestamp_sequences) && token_timestamp_sequences.length > 0; - let total_token_timestamp_sequence = use_token_timestamp_sequences ? [] : null; - let left_token_timestamp_sequence = use_token_timestamp_sequences ? token_timestamp_sequences[0] : null; - for (let i = 1; i < sequences.length; ++i) { - const rightSequence = sequences[i]; - let max = 0.0; - let maxIndices = [leftLength, leftLength, 0, 0]; - // Here we're sliding matches - // [a, b, c, d] - // [c, d, f] - // = [c] == [d] + let previous_tokens = []; + let previous_token_timestamps = []; - // [a, b, c, d] - // [c, d, f] - // = [c, d] == [c, d] + let skip = false; + let right_stride_start = null; + const all_special_ids = new Set(this.all_special_ids); - // [a, b, c, d] - // [c, d, f] + for (const output of sequences) { + // NOTE: python version has batches, so it uses [0] + const token_ids = output.tokens; + const token_timestamps = returnWordTimestamps + ? output.token_timestamps + : null; - // = [b, c, d] == [c, d, f] + // These keep track of timestamps within strides, which need + // to be skipped and resolve all tokens in a single chunk. + let last_timestamp = null; + let first_timestamp = timestamp_begin; - // [a, b, c, d] - // [c, d, f] + if ("stride" in output) { + const [chunk_len, stride_left, stride_right] = output.stride; - // [a, b, c] == [c, d, f] + // Offset the timings to account for the other `model_outputs`. 
+ time_offset -= stride_left; + right_stride_start = chunk_len - stride_right; - // [a, b, c, d] - // [d, f] - - // [a, b] == [d, f] - - // [a, b, c, d] - // [f] - - // [a] == [f] - - const rightLength = rightSequence.length; - for (let j = 1; j < leftLength + rightLength; ++j) { - const eps = j / 10000.0; - const leftStart = Math.max(0, leftLength - j); - const leftStop = Math.min(leftLength, leftLength + rightLength - j); - const left = leftSequence.slice(leftStart, leftStop); - const rightStart = Math.max(0, j - leftLength); - const rightStop = Math.min(rightLength, j); - const right = rightSequence.slice(rightStart, rightStop); - if (left.length !== right.length) { - throw new Error("There is a bug within whisper `decode_asr` function, please report it. Dropping to prevent bad inference."); - } - const matches = left.filter((elem, idx) => elem === right[idx]).length; - const matching = matches / j + eps; - if (matches > 1 && matching > max) { - max = matching; - maxIndices = [leftStart, leftStop, rightStart, rightStop]; - } - } - const [leftStart, leftStop, rightStart, rightStop] = maxIndices; - const leftMid = Math.floor((leftStop + leftStart) / 2); - const rightMid = Math.floor((rightStop + rightStart) / 2); - totalSequence.push(...leftSequence.slice(0, leftMid)); - leftSequence = rightSequence.slice(rightMid); - leftLength = leftSequence.length; - - if (use_token_timestamp_sequences) { - total_token_timestamp_sequence.push(...left_token_timestamp_sequence.slice(0, leftMid)); - left_token_timestamp_sequence = token_timestamp_sequences[i].slice(rightMid); - } - } - totalSequence.push(...leftSequence); - - if (use_token_timestamp_sequences) { - total_token_timestamp_sequence.push(...left_token_timestamp_sequence); - return [totalSequence, total_token_timestamp_sequence]; - } else { - return [totalSequence, []]; - } - } - - /** @private */ - collateWordTimestamps(tokens, token_timestamps, language) { - - const [words, _, token_indices] = 
this.combineTokensIntoWords(tokens, language); - - const timings = []; - for (let i = 0; i < words.length; ++i) { - const indices = token_indices[i]; - timings.push({ - text: words[i], - timestamp: [ - token_timestamps[indices.at(0)][0], - token_timestamps[indices.at(-1)][1], - ], - }); - } - return timings; - } - - /** - * Groups tokens by word. Returns a tuple containing a list of strings with the words, - * and a list of `token_id` sequences with the tokens making up each word. - * @param {number[]} tokens - * @param {string} [language] - * @param {string} prepend_punctionations - * @param {string} append_punctuations - * - * @private - */ - combineTokensIntoWords(tokens, language, prepend_punctionations = "\"'โ€œยกยฟ([{-", append_punctuations = "\"'.ใ€‚,๏ผŒ!๏ผ?๏ผŸ:๏ผšโ€)]}ใ€") { - language = language ?? 'english'; - - let words, word_tokens, token_indices; - - if (["chinese", "japanese", "thai", "lao", "myanmar"].includes(language)) { - // These languages don't typically use spaces. - [words, word_tokens, token_indices] = this.splitTokensOnUnicode(tokens) - } else { - [words, word_tokens, token_indices] = this.splitTokensOnSpaces(tokens) + // Keeping track of timestamps within strides + // We're going to NOT split on those, and delay until we're + // out of BOTH stride. 
Otherwise lots of issues occur and + // corner cases + if (stride_left) { + first_timestamp = stride_left / time_precision + timestamp_begin; } - return this.mergePunctuations(words, word_tokens, token_indices, prepend_punctionations, append_punctuations); - } - - /** @type {PreTrainedTokenizer['decode']} */ - decode( - token_ids, - decode_args, - ) { - let text; - // @ts-ignore - if (decode_args && decode_args.decode_with_timestamps) { - if (token_ids instanceof Tensor) { - token_ids = prepareTensorForDecode(token_ids); - } - text = this.decodeWithTimestamps(token_ids, decode_args); - } else { - text = super.decode(token_ids, decode_args); - } - // TODO: implement offsets - // if (decode_args.output_offsets) { - // let offsets = this.computeOffsets - // } - return text; - } - - /** - * @param {number[]} token_ids List of token IDs to decode. - * @param {Object} decode_args Optional arguments for decoding - * @private - */ - decodeWithTimestamps(token_ids, decode_args) { - const time_precision = decode_args?.time_precision ?? 
0.02; - - const timestamp_begin = Array.from(this.all_special_ids).at(-1) + 1; - /**@type {Array} */ - let outputs = [[]]; - for (const token of token_ids) { + if (stride_right) { + for (let i = token_ids.length - 1; i >= 0; --i) { + const token = token_ids[i]; if (token >= timestamp_begin) { - const timestamp = round((token - timestamp_begin) * time_precision, 2); - outputs.push(`<|${timestamp}|>`); - outputs.push([]); + // There can be several token in the right stride + // But the last one is ALWAYS going to be skipped + if ( + last_timestamp !== null && + (token - timestamp_begin) * time_precision < right_stride_start + ) { + break; + } + last_timestamp = token; + } + } + } + } + + let current_tokens = []; + let current_token_timestamps = []; + + // - all tokens within output + for (let i = 0; i < token_ids.length; ++i) { + const token = token_ids[i]; + // 4 possible states for each token + // - 1/ Language code + // - 2/ all other special tokens (which we ignore) + // - 3/ Timestamp + // - 4/ Regular text + + if (all_special_ids.has(token)) { + const text = this.decode([token]); + const language = WHISPER_LANGUAGE_MAPPING.get(text.slice(2, -2)); + + if (language !== undefined) { + // 1/ Indeed some language + // TODO Handle when language is different from the previous + // one, and we cannot use timestamped tokens to create chunks + if ( + last_language !== null && + language !== last_language && + !return_timestamps + ) { + previous_tokens.push(current_tokens); + const resolved_tokens = + this.findLongestCommonSequence(previous_tokens)[0]; + const resolved_text = this.decode(resolved_tokens); + chunk.text = resolved_text; + chunks.push(chunk); + + // Flush all our temporary context + previous_tokens = []; + current_tokens = []; + chunk = new_chunk(); + } + + last_language = chunk.language = language; + } else { + // 2/ This is a regular special token, ignoring it + } + } else if (token >= timestamp_begin) { + // 3/ Timestamp token + const time = (token - 
timestamp_begin) * time_precision + time_offset; + const rounded_time = round(time, 2); + + if (last_timestamp !== null && token >= last_timestamp) { + // Whisper outputted a timestamp token, but it falls within + // our stride, so we're going to skip it for the time being + // and resolve this later + // Skip is necessary because timestamp tokens always come + // by pair, so we need to skip the next one too (which would mark the start of another chunk). + skip = true; + } else if ( + skip || + (previous_tokens.length > 0 && token < first_timestamp) + ) { + skip = false; + } else if (chunk.timestamp[0] === null) { + chunk.timestamp[0] = rounded_time; + } else { + // This is the end of the timestamp chunk + if (rounded_time === chunk.timestamp[0]) { + // This is a bug in timestamp token output + // where we're taking the duplicate token + // as a stop where it should be a start. + // This is an issue in the underlying model output + // Let's just skip it so it becomes de-factor a start agin } else { - outputs[outputs.length - 1].push(token); + chunk.timestamp[1] = rounded_time; + + // Handling merges + previous_tokens.push(current_tokens); + + if (returnWordTimestamps) { + previous_token_timestamps.push(current_token_timestamps); + } + const [resolved_tokens, resolved_token_timestamps] = + this.findLongestCommonSequence( + previous_tokens, + previous_token_timestamps, + ); + + const resolved_text = this.decode(resolved_tokens); + chunk.text = resolved_text; + + if (returnWordTimestamps) { + chunk.words = this.collateWordTimestamps( + resolved_tokens, + resolved_token_timestamps, + last_language, + ); + } + + chunks.push(chunk); + + // Flush all our temporary context + previous_tokens = []; + current_tokens = []; + previous_token_timestamps = []; + current_token_timestamps = []; + chunk = new_chunk(); } - } - outputs = outputs.map( - s => { - if (typeof s === 'string') { - return s; - } else { - return super.decode(s, decode_args); - } - } - ) - - return 
outputs.join(''); - } - - /** - * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points. - * @param {number[]} tokens - * @returns {*} - * @private - */ - splitTokensOnUnicode(tokens) { - const decoded_full = this.decode(tokens, { - // @ts-ignore - decode_with_timestamps: true, - }); - const replacement_char = '\uFFFD'; - - const words = [] - const word_tokens = [] - const token_indices = [] - let current_tokens = [] - let current_indices = [] - let unicode_offset = 0 - - for (let token_idx = 0; token_idx < tokens.length; ++token_idx) { - const token = tokens[token_idx]; - - current_tokens.push(token); - current_indices.push(token_idx); - - const decoded = this.decode(current_tokens, { - // @ts-ignore - decode_with_timestamps: true, - }); - - if (!decoded.includes(replacement_char) || decoded_full[unicode_offset + decoded.indexOf(replacement_char)] === replacement_char) { - words.push(decoded) - word_tokens.push(current_tokens) - token_indices.push(current_indices) - current_tokens = [] - current_indices = [] - unicode_offset += decoded.length; - } - - } - - return [words, word_tokens, token_indices] - } - - /** - * Combine tokens into words by splitting at whitespace and punctuation tokens. 
- * @param {number[]} tokens - * @private - */ - splitTokensOnSpaces(tokens) { - - const [subwords, subword_tokens_list, subword_indices_list] = this.splitTokensOnUnicode(tokens); - - const words = [] - const word_tokens = [] - const token_indices = [] - - const punctuationRegex = new RegExp(`^[${PUNCTUATION_REGEX}]$`, 'gu'); - - for (let i = 0; i < subwords.length; ++i) { - - const subword = subwords[i]; - const subword_tokens = subword_tokens_list[i]; - const subword_indices = subword_indices_list[i]; - - // @ts-ignore - const special = subword_tokens[0] >= this.model.tokens_to_ids.get('<|endoftext|>'); - const with_space = subword.startsWith(' '); - const trimmed = subword.trim(); - const punctuation = punctuationRegex.test(trimmed); - - if (special || with_space || punctuation || words.length === 0) { - words.push(subword); - word_tokens.push(subword_tokens); - token_indices.push(subword_indices); - } else { - const ix = words.length - 1; - words[ix] += subword; - word_tokens[ix].push(...subword_tokens); - token_indices[ix].push(...subword_indices); - } - } - - return [words, word_tokens, token_indices]; - - } - - /** - * Merges punctuation tokens with neighboring words. 
- * @param {string[]} words - * @param {number[][]} tokens - * @param {number[][]} indices - * @param {string} prepended - * @param {string} appended - * @private - */ - mergePunctuations(words, tokens, indices, prepended, appended) { - - const newWords = structuredClone(words); - const newTokens = structuredClone(tokens); - const newIndices = structuredClone(indices); - - - // prepend punctuations - let i = newWords.length - 2; - let j = newWords.length - 1; - - while (i >= 0) { - if (newWords[i].startsWith(' ') && prepended.includes(newWords[i].trim())) { - newWords[j] = newWords[i] + newWords[j]; - newTokens[j] = mergeArrays(newTokens[i], newTokens[j]); - newIndices[j] = mergeArrays(newIndices[i], newIndices[j]); - newWords[i] = ''; - newTokens[i] = []; - newIndices[i] = []; - } else { - j = i; - } - --i; - } - - // append punctuations - i = 0; - j = 1; - while (j < newWords.length) { - if (!newWords[i].endsWith(' ') && appended.includes(newWords[j])) { - newWords[i] += newWords[j]; - newTokens[i] = mergeArrays(newTokens[i], newTokens[j]); - newIndices[i] = mergeArrays(newIndices[i], newIndices[j]); - newWords[j] = ''; - newTokens[j] = []; - newIndices[j] = []; - } else { - i = j; - } - ++j; - } - - return [ - newWords.filter(x => x), - newTokens.filter(x => x.length > 0), - newIndices.filter(x => x.length > 0), - ] - } - - /** - * Helper function to build translation inputs for a `WhisperTokenizer`, - * depending on the language, task, and whether to predict timestamp tokens. - * - * Used to override the prefix tokens appended to the start of the label sequence. 
- * - * **Example: Get ids for a language** - * ```javascript - * // instantiate the tokenizer and set the prefix token to Spanish - * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny'); - * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' }); - * // [(1, 50262), (2, 50363)] - * ``` - * - * @param {Object} options Options to generate the decoder prompt. - * @param {string} [options.language] The language of the transcription text. - * The corresponding language id token is appended to the start of the sequence for multilingual - * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended - * to the start of sequence. - * @param {string} [options.task] Task identifier to append at the start of sequence (if any). - * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and - * "translate" for speech translation. - * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence. - * @returns {number[][]} The decoder prompt ids. - */ - get_decoder_prompt_ids({ - language = null, - task = null, - no_timestamps = true, - } = {}) { - - // <|lang_id|> <|task|> <|notimestamps|> - - const forced_decoder_ids = []; - - if (language) { - // User wishes to specify the language - language = language.toLowerCase(); - - // Map to code from user-friendly name (e.g., "english" -> "en") - let language_code = WHISPER_TO_LANGUAGE_CODE_MAPPING.get(language); - - if (language_code === undefined) { - // User provided something that is not a language name - - if (WHISPER_LANGUAGE_MAPPING.has(language)) { - // User provided the language code directly (e.g., "en") - language_code = language; - - } else { - // User provided something that is not a language code or name - const is_language_code = language.length === 2; - const langs = is_language_code ? 
WHISPER_LANGUAGE_MAPPING.keys() : WHISPER_LANGUAGE_MAPPING.values(); - - throw new Error(`Language "${language}" is not supported. Must be one of: ${JSON.stringify(langs)}`); - } - } - - const language_token_id = this.model.tokens_to_ids.get(`<|${language_code}|>`); - if (language_token_id === undefined) { - throw new Error(`Unable to find language "${language_code}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`) - } - - forced_decoder_ids.push(language_token_id); + } } else { - // No token will be forced, which leaves the model to predict the language - forced_decoder_ids.push(null); - } + // 4/ Regular token + // We just append to the list of all tokens so we can handle + // merges later and decode into text. + current_tokens.push(token); - if (task) { - task = task.toLowerCase(); - if (task !== 'transcribe' && task !== 'translate') { - throw new Error(`Task "${task}" is not supported. Must be one of: ["transcribe", "translate"]`); + if (returnWordTimestamps) { + let start_time = round(token_timestamps[i] + time_offset, 2); + + let end_time; + if (i + 1 < token_timestamps.length) { + end_time = round(token_timestamps[i + 1] + time_offset, 2); + } else { + // should never happen + end_time = null; } - - const task_token_id = this.model.tokens_to_ids.get(`<|${task}|>`); - if (task_token_id === undefined) { - throw new Error(`Unable to find task "${task}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`) - } - - forced_decoder_ids.push(task_token_id); - } else { - // No token will be forced, which leaves the model to predict the task - forced_decoder_ids.push(null); + current_token_timestamps.push([start_time, end_time]); + } } + } - if (no_timestamps) { - const no_timestamps_id = this.model.tokens_to_ids.get(`<|notimestamps|>`); - if (no_timestamps_id === undefined) { - throw new Error('Unable to find "<|notimestamps|>" in model vocabulary. 
Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.') - } + if ("stride" in output) { + const [chunk_len, stride_left, stride_right] = output.stride; + time_offset += chunk_len - stride_right; + } - forced_decoder_ids.push(no_timestamps_id); + // Leftover tokens + if (current_tokens.length > 0) { + previous_tokens.push(current_tokens); + if (returnWordTimestamps) { + previous_token_timestamps.push(current_token_timestamps); } - - return forced_decoder_ids.map((x, i) => [i + 1, x]).filter(x => x[1] !== null); - + } else if (previous_tokens.every((p) => p.length === 0)) { + // Flushing previous tokens (END)" + chunk = new_chunk(); + previous_tokens = []; + current_tokens = []; + previous_token_timestamps = []; + current_token_timestamps = []; + } } + + if (previous_tokens.length > 0) { + if (force_full_sequences && return_timestamps) { + // Last token should always be timestamps, so there shouldn't be + // leftover + throw new Error( + "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. 
" + + "Also make sure WhisperTimeStampLogitsProcessor was used during generation.", + ); + } + + // Happens when we don't use timestamps + const [resolved_tokens, resolved_token_timestamps] = + this.findLongestCommonSequence( + previous_tokens, + previous_token_timestamps, + ); + + // Flushing previous tokens (FINAL) + const resolved_text = this.decode(resolved_tokens); + chunk.text = resolved_text; + if (returnWordTimestamps) { + chunk.words = this.collateWordTimestamps( + resolved_tokens, + resolved_token_timestamps, + last_language, + ); + } + chunks.push(chunk); + } + + let optional = Object.create(null); + + // Preparing and cleaning up the pipeline output + const full_text = chunks.map((chunk) => chunk.text).join(""); + if (return_timestamps || return_language) { + for (let i = 0; i < chunks.length; ++i) { + const chunk = chunks[i]; + if (!return_timestamps) { + delete chunk["timestamp"]; + } + + if (!return_language) { + delete chunk["language"]; + } + } + if (returnWordTimestamps) { + const new_chunks = []; + for (const chunk of chunks) { + for (const word of chunk.words) { + new_chunks.push(word); + } + } + optional = { chunks: new_chunks }; + } else { + optional = { chunks: chunks }; + } + } + return [full_text, optional]; + } + + /** + * Finds the longest common sequence among the provided sequences. + * @param {number[][]} sequences An array of sequences of token ids to compare. + * @returns {number[][]} The longest common sequence found. + * @throws {Error} If there is a bug within the function. + * @private + */ + findLongestCommonSequence(sequences, token_timestamp_sequences = null) { + // It would be much harder to do O(n) because of fault tolerance. + // We actually have a really good property which is that the total sequence + // MUST be those subsequences in order. + // If token_timestamp_sequences is provided, will split those sequences in + // exactly the same way. 
+ let leftSequence = sequences[0]; + let leftLength = leftSequence.length; + let totalSequence = []; + + const use_token_timestamp_sequences = + Array.isArray(token_timestamp_sequences) && + token_timestamp_sequences.length > 0; + let total_token_timestamp_sequence = use_token_timestamp_sequences + ? [] + : null; + let left_token_timestamp_sequence = use_token_timestamp_sequences + ? token_timestamp_sequences[0] + : null; + for (let i = 1; i < sequences.length; ++i) { + const rightSequence = sequences[i]; + let max = 0.0; + let maxIndices = [leftLength, leftLength, 0, 0]; + // Here we're sliding matches + // [a, b, c, d] + // [c, d, f] + // = [c] == [d] + + // [a, b, c, d] + // [c, d, f] + // = [c, d] == [c, d] + + // [a, b, c, d] + // [c, d, f] + + // = [b, c, d] == [c, d, f] + + // [a, b, c, d] + // [c, d, f] + + // [a, b, c] == [c, d, f] + + // [a, b, c, d] + // [d, f] + + // [a, b] == [d, f] + + // [a, b, c, d] + // [f] + + // [a] == [f] + + const rightLength = rightSequence.length; + for (let j = 1; j < leftLength + rightLength; ++j) { + const eps = j / 10000.0; + const leftStart = Math.max(0, leftLength - j); + const leftStop = Math.min(leftLength, leftLength + rightLength - j); + const left = leftSequence.slice(leftStart, leftStop); + const rightStart = Math.max(0, j - leftLength); + const rightStop = Math.min(rightLength, j); + const right = rightSequence.slice(rightStart, rightStop); + if (left.length !== right.length) { + throw new Error( + "There is a bug within whisper `decode_asr` function, please report it. 
Dropping to prevent bad inference.", + ); + } + const matches = left.filter((elem, idx) => elem === right[idx]).length; + const matching = matches / j + eps; + if (matches > 1 && matching > max) { + max = matching; + maxIndices = [leftStart, leftStop, rightStart, rightStop]; + } + } + const [leftStart, leftStop, rightStart, rightStop] = maxIndices; + const leftMid = Math.floor((leftStop + leftStart) / 2); + const rightMid = Math.floor((rightStop + rightStart) / 2); + totalSequence.push(...leftSequence.slice(0, leftMid)); + leftSequence = rightSequence.slice(rightMid); + leftLength = leftSequence.length; + + if (use_token_timestamp_sequences) { + total_token_timestamp_sequence.push( + ...left_token_timestamp_sequence.slice(0, leftMid), + ); + left_token_timestamp_sequence = + token_timestamp_sequences[i].slice(rightMid); + } + } + totalSequence.push(...leftSequence); + + if (use_token_timestamp_sequences) { + total_token_timestamp_sequence.push(...left_token_timestamp_sequence); + return [totalSequence, total_token_timestamp_sequence]; + } else { + return [totalSequence, []]; + } + } + + /** @private */ + collateWordTimestamps(tokens, token_timestamps, language) { + const [words, _, token_indices] = this.combineTokensIntoWords( + tokens, + language, + ); + + const timings = []; + for (let i = 0; i < words.length; ++i) { + const indices = token_indices[i]; + timings.push({ + text: words[i], + timestamp: [ + token_timestamps[indices.at(0)][0], + token_timestamps[indices.at(-1)][1], + ], + }); + } + return timings; + } + + /** + * Groups tokens by word. Returns a tuple containing a list of strings with the words, + * and a list of `token_id` sequences with the tokens making up each word. 
+ * @param {number[]} tokens + * @param {string} [language] + * @param {string} prepend_punctionations + * @param {string} append_punctuations + * + * @private + */ + combineTokensIntoWords( + tokens, + language, + prepend_punctionations = "\"'โ€œยกยฟ([{-", + append_punctuations = "\"'.ใ€‚,๏ผŒ!๏ผ?๏ผŸ:๏ผšโ€)]}ใ€", + ) { + language = language ?? "english"; + + let words, word_tokens, token_indices; + + if (["chinese", "japanese", "thai", "lao", "myanmar"].includes(language)) { + // These languages don't typically use spaces. + [words, word_tokens, token_indices] = this.splitTokensOnUnicode(tokens); + } else { + [words, word_tokens, token_indices] = this.splitTokensOnSpaces(tokens); + } + + return this.mergePunctuations( + words, + word_tokens, + token_indices, + prepend_punctionations, + append_punctuations, + ); + } + + /** @type {PreTrainedTokenizer['decode']} */ + decode(token_ids, decode_args) { + let text; + // @ts-ignore + if (decode_args && decode_args.decode_with_timestamps) { + if (token_ids instanceof Tensor) { + token_ids = prepareTensorForDecode(token_ids); + } + text = this.decodeWithTimestamps(token_ids, decode_args); + } else { + text = super.decode(token_ids, decode_args); + } + // TODO: implement offsets + // if (decode_args.output_offsets) { + // let offsets = this.computeOffsets + // } + return text; + } + + /** + * @param {number[]} token_ids List of token IDs to decode. + * @param {Object} decode_args Optional arguments for decoding + * @private + */ + decodeWithTimestamps(token_ids, decode_args) { + const time_precision = decode_args?.time_precision ?? 
0.02; + + const timestamp_begin = Array.from(this.all_special_ids).at(-1) + 1; + /**@type {Array} */ + let outputs = [[]]; + for (const token of token_ids) { + if (token >= timestamp_begin) { + const timestamp = round((token - timestamp_begin) * time_precision, 2); + outputs.push(`<|${timestamp}|>`); + outputs.push([]); + } else { + outputs[outputs.length - 1].push(token); + } + } + outputs = outputs.map((s) => { + if (typeof s === "string") { + return s; + } else { + return super.decode(s, decode_args); + } + }); + + return outputs.join(""); + } + + /** + * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points. + * @param {number[]} tokens + * @returns {*} + * @private + */ + splitTokensOnUnicode(tokens) { + const decoded_full = this.decode(tokens, { + // @ts-ignore + decode_with_timestamps: true, + }); + const replacement_char = "\uFFFD"; + + const words = []; + const word_tokens = []; + const token_indices = []; + let current_tokens = []; + let current_indices = []; + let unicode_offset = 0; + + for (let token_idx = 0; token_idx < tokens.length; ++token_idx) { + const token = tokens[token_idx]; + + current_tokens.push(token); + current_indices.push(token_idx); + + const decoded = this.decode(current_tokens, { + // @ts-ignore + decode_with_timestamps: true, + }); + + if ( + !decoded.includes(replacement_char) || + decoded_full[unicode_offset + decoded.indexOf(replacement_char)] === + replacement_char + ) { + words.push(decoded); + word_tokens.push(current_tokens); + token_indices.push(current_indices); + current_tokens = []; + current_indices = []; + unicode_offset += decoded.length; + } + } + + return [words, word_tokens, token_indices]; + } + + /** + * Combine tokens into words by splitting at whitespace and punctuation tokens. 
+ * @param {number[]} tokens + * @private + */ + splitTokensOnSpaces(tokens) { + const [subwords, subword_tokens_list, subword_indices_list] = + this.splitTokensOnUnicode(tokens); + + const words = []; + const word_tokens = []; + const token_indices = []; + + const punctuationRegex = new RegExp(`^[${PUNCTUATION_REGEX}]$`, "gu"); + + for (let i = 0; i < subwords.length; ++i) { + const subword = subwords[i]; + const subword_tokens = subword_tokens_list[i]; + const subword_indices = subword_indices_list[i]; + + // @ts-ignore + const special = + subword_tokens[0] >= this.model.tokens_to_ids.get("<|endoftext|>"); + const with_space = subword.startsWith(" "); + const trimmed = subword.trim(); + const punctuation = punctuationRegex.test(trimmed); + + if (special || with_space || punctuation || words.length === 0) { + words.push(subword); + word_tokens.push(subword_tokens); + token_indices.push(subword_indices); + } else { + const ix = words.length - 1; + words[ix] += subword; + word_tokens[ix].push(...subword_tokens); + token_indices[ix].push(...subword_indices); + } + } + + return [words, word_tokens, token_indices]; + } + + /** + * Merges punctuation tokens with neighboring words. 
+ * @param {string[]} words + * @param {number[][]} tokens + * @param {number[][]} indices + * @param {string} prepended + * @param {string} appended + * @private + */ + mergePunctuations(words, tokens, indices, prepended, appended) { + const newWords = structuredClone(words); + const newTokens = structuredClone(tokens); + const newIndices = structuredClone(indices); + + // prepend punctuations + let i = newWords.length - 2; + let j = newWords.length - 1; + + while (i >= 0) { + if ( + newWords[i].startsWith(" ") && + prepended.includes(newWords[i].trim()) + ) { + newWords[j] = newWords[i] + newWords[j]; + newTokens[j] = mergeArrays(newTokens[i], newTokens[j]); + newIndices[j] = mergeArrays(newIndices[i], newIndices[j]); + newWords[i] = ""; + newTokens[i] = []; + newIndices[i] = []; + } else { + j = i; + } + --i; + } + + // append punctuations + i = 0; + j = 1; + while (j < newWords.length) { + if (!newWords[i].endsWith(" ") && appended.includes(newWords[j])) { + newWords[i] += newWords[j]; + newTokens[i] = mergeArrays(newTokens[i], newTokens[j]); + newIndices[i] = mergeArrays(newIndices[i], newIndices[j]); + newWords[j] = ""; + newTokens[j] = []; + newIndices[j] = []; + } else { + i = j; + } + ++j; + } + + return [ + newWords.filter((x) => x), + newTokens.filter((x) => x.length > 0), + newIndices.filter((x) => x.length > 0), + ]; + } + + /** + * Helper function to build translation inputs for a `WhisperTokenizer`, + * depending on the language, task, and whether to predict timestamp tokens. + * + * Used to override the prefix tokens appended to the start of the label sequence. 
+ * + * **Example: Get ids for a language** + * ```javascript + * // instantiate the tokenizer and set the prefix token to Spanish + * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny'); + * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' }); + * // [(1, 50262), (2, 50363)] + * ``` + * + * @param {Object} options Options to generate the decoder prompt. + * @param {string} [options.language] The language of the transcription text. + * The corresponding language id token is appended to the start of the sequence for multilingual + * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended + * to the start of sequence. + * @param {string} [options.task] Task identifier to append at the start of sequence (if any). + * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and + * "translate" for speech translation. + * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence. + * @returns {number[][]} The decoder prompt ids. + */ + get_decoder_prompt_ids({ + language = null, + task = null, + no_timestamps = true, + } = {}) { + // <|lang_id|> <|task|> <|notimestamps|> + + const forced_decoder_ids = []; + + if (language) { + // User wishes to specify the language + language = language.toLowerCase(); + + // Map to code from user-friendly name (e.g., "english" -> "en") + let language_code = WHISPER_TO_LANGUAGE_CODE_MAPPING.get(language); + + if (language_code === undefined) { + // User provided something that is not a language name + + if (WHISPER_LANGUAGE_MAPPING.has(language)) { + // User provided the language code directly (e.g., "en") + language_code = language; + } else { + // User provided something that is not a language code or name + const is_language_code = language.length === 2; + const langs = is_language_code + ? 
WHISPER_LANGUAGE_MAPPING.keys() + : WHISPER_LANGUAGE_MAPPING.values(); + + throw new Error( + `Language "${language}" is not supported. Must be one of: ${JSON.stringify(langs)}`, + ); + } + } + + const language_token_id = this.model.tokens_to_ids.get( + `<|${language_code}|>`, + ); + if (language_token_id === undefined) { + throw new Error( + `Unable to find language "${language_code}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`, + ); + } + + forced_decoder_ids.push(language_token_id); + } else { + // No token will be forced, which leaves the model to predict the language + forced_decoder_ids.push(null); + } + + if (task) { + task = task.toLowerCase(); + if (task !== "transcribe" && task !== "translate") { + throw new Error( + `Task "${task}" is not supported. Must be one of: ["transcribe", "translate"]`, + ); + } + + const task_token_id = this.model.tokens_to_ids.get(`<|${task}|>`); + if (task_token_id === undefined) { + throw new Error( + `Unable to find task "${task}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`, + ); + } + + forced_decoder_ids.push(task_token_id); + } else { + // No token will be forced, which leaves the model to predict the task + forced_decoder_ids.push(null); + } + + if (no_timestamps) { + const no_timestamps_id = this.model.tokens_to_ids.get(`<|notimestamps|>`); + if (no_timestamps_id === undefined) { + throw new Error( + 'Unable to find "<|notimestamps|>" in model vocabulary. 
Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.', + ); + } + + forced_decoder_ids.push(no_timestamps_id); + } + + return forced_decoder_ids + .map((x, i) => [i + 1, x]) + .filter((x) => x[1] !== null); + } } -export class CodeGenTokenizer extends PreTrainedTokenizer { } -export class CLIPTokenizer extends PreTrainedTokenizer { } -export class SiglipTokenizer extends PreTrainedTokenizer { } +export class CodeGenTokenizer extends PreTrainedTokenizer {} +export class CLIPTokenizer extends PreTrainedTokenizer {} +export class SiglipTokenizer extends PreTrainedTokenizer {} /** * @todo This model is not yet supported by Hugging Face's "fast" tokenizers library (https://github.com/huggingface/tokenizers). * Therefore, this implementation (which is based on fast tokenizers) may produce slightly inaccurate results. */ export class MarianTokenizer extends PreTrainedTokenizer { - /** - * Create a new MarianTokenizer instance. - * @param {Object} tokenizerJSON The JSON of the tokenizer. - * @param {Object} tokenizerConfig The config of the tokenizer. - */ - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); + /** + * Create a new MarianTokenizer instance. + * @param {Object} tokenizerJSON The JSON of the tokenizer. + * @param {Object} tokenizerConfig The config of the tokenizer. + */ + constructor(tokenizerJSON, tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); - this.languageRegex = /^(>>\w+<<)\s*/g; + this.languageRegex = /^(>>\w+<<)\s*/g; - this.supported_language_codes = this.model.vocab.filter( - x => this.languageRegex.test(x) + this.supported_language_codes = this.model.vocab.filter((x) => + this.languageRegex.test(x), + ); + + console.warn( + 'WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.', + ); + } + + /** + * Encodes a single text. 
Overriding this method is necessary since the language codes + * must be removed before encoding with sentencepiece model. + * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213 + * + * @param {string|null} text The text to encode. + * @returns {Array} The encoded tokens. + */ + _encode_text(text) { + if (text === null) return null; + + // Check if text starts with language code: + const [matchInfo, ...remainder] = text.trim().split(this.languageRegex); + + if (remainder.length === 0) { + // No language code, encode normally + return super._encode_text(matchInfo); + } else if (remainder.length === 2) { + // Text starts with language code, so we do not encode it with sentencepiece. + const [language, text] = remainder; + + if (!this.supported_language_codes.includes(language)) { + console.warn( + `Unsupported language code "${language}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`, ); - - console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.') + } + return mergeArrays([language], super._encode_text(text)); } - - /** - * Encodes a single text. Overriding this method is necessary since the language codes - * must be removed before encoding with sentencepiece model. - * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213 - * - * @param {string|null} text The text to encode. - * @returns {Array} The encoded tokens. 
- */ - _encode_text(text) { - if (text === null) return null; - - // Check if text starts with language code: - const [matchInfo, ...remainder] = text.trim().split(this.languageRegex); - - if (remainder.length === 0) { - // No language code, encode normally - return super._encode_text(matchInfo); - - } else if (remainder.length === 2) { - // Text starts with language code, so we do not encode it with sentencepiece. - const [language, text] = remainder; - - if (!this.supported_language_codes.includes(language)) { - console.warn(`Unsupported language code "${language}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`) - } - return mergeArrays([language], super._encode_text(text)); - } - } - + } } -export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { } +export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {} export class BlenderbotTokenizer extends PreTrainedTokenizer { - _default_chat_template = `{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}`; + _default_chat_template = `{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}`; } -export class BlenderbotSmallTokenizer extends BlenderbotTokenizer { } // NOTE `BlenderbotTokenizer` to get the correct chat template +export class BlenderbotSmallTokenizer extends BlenderbotTokenizer {} // NOTE `BlenderbotTokenizer` to get the correct chat template -export class SpeechT5Tokenizer extends PreTrainedTokenizer { } +export class SpeechT5Tokenizer extends PreTrainedTokenizer {} -export class NougatTokenizer extends PreTrainedTokenizer { } +export class NougatTokenizer extends PreTrainedTokenizer {} export class VitsTokenizer extends PreTrainedTokenizer { + constructor(tokenizerJSON, 
tokenizerConfig) { + super(tokenizerJSON, tokenizerConfig); - constructor(tokenizerJSON, tokenizerConfig) { - super(tokenizerJSON, tokenizerConfig); - - // Custom decoder function - this.decoder = new VitsDecoder({}); - } + // Custom decoder function + this.decoder = new VitsDecoder({}); + } } /** * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function. * The chosen tokenizer class is determined by the type specified in the tokenizer config. - * + * * @example * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); */ export class AutoTokenizer { - static TOKENIZER_CLASS_MAPPING = { - T5Tokenizer, - DistilBertTokenizer, - CamembertTokenizer, - DebertaTokenizer, - DebertaV2Tokenizer, - BertTokenizer, - HerbertTokenizer, - ConvBertTokenizer, - RoFormerTokenizer, - XLMTokenizer, - ElectraTokenizer, - MobileBertTokenizer, - SqueezeBertTokenizer, - AlbertTokenizer, - GPT2Tokenizer, - BartTokenizer, - MBartTokenizer, - MBart50Tokenizer, - RobertaTokenizer, - WhisperTokenizer, - CodeGenTokenizer, - CLIPTokenizer, - SiglipTokenizer, - MarianTokenizer, - BloomTokenizer, - NllbTokenizer, - M2M100Tokenizer, - LlamaTokenizer, - CodeLlamaTokenizer, - XLMRobertaTokenizer, - MPNetTokenizer, - FalconTokenizer, - GPTNeoXTokenizer, - EsmTokenizer, - Wav2Vec2CTCTokenizer, - BlenderbotTokenizer, - BlenderbotSmallTokenizer, - SpeechT5Tokenizer, - NougatTokenizer, - VitsTokenizer, + static TOKENIZER_CLASS_MAPPING = { + T5Tokenizer, + DistilBertTokenizer, + CamembertTokenizer, + DebertaTokenizer, + DebertaV2Tokenizer, + BertTokenizer, + HerbertTokenizer, + ConvBertTokenizer, + RoFormerTokenizer, + XLMTokenizer, + ElectraTokenizer, + MobileBertTokenizer, + SqueezeBertTokenizer, + AlbertTokenizer, + GPT2Tokenizer, + BartTokenizer, + MBartTokenizer, + MBart50Tokenizer, + RobertaTokenizer, + WhisperTokenizer, + CodeGenTokenizer, + CLIPTokenizer, + SiglipTokenizer, + MarianTokenizer, + BloomTokenizer, + NllbTokenizer, 
+ M2M100Tokenizer, + LlamaTokenizer, + CodeLlamaTokenizer, + XLMRobertaTokenizer, + MPNetTokenizer, + FalconTokenizer, + GPTNeoXTokenizer, + EsmTokenizer, + Wav2Vec2CTCTokenizer, + BlenderbotTokenizer, + BlenderbotSmallTokenizer, + SpeechT5Tokenizer, + NougatTokenizer, + VitsTokenizer, - // Base case: - PreTrainedTokenizer, - } - - - /** - * Instantiate one of the tokenizer classes of the library from a pretrained model. - * - * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`. - * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. - * - * @returns {Promise} A new instance of the PreTrainedTokenizer class. - */ - static async from_pretrained(pretrained_model_name_or_path, { - quantized = true, - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - legacy = null, - } = {}) { - - const [tokenizerJSON, tokenizerConfig] = await loadTokenizer(pretrained_model_name_or_path, { - quantized, - progress_callback, - config, - cache_dir, - local_files_only, - revision, - legacy, - }) - - // Some tokenizers are saved with the "Fast" suffix, so we remove that if present. - const tokenizerName = tokenizerConfig.tokenizer_class?.replace(/Fast$/, '') ?? 
'PreTrainedTokenizer'; - - let cls = this.TOKENIZER_CLASS_MAPPING[tokenizerName]; - if (!cls) { - console.warn(`Unknown tokenizer class "${tokenizerName}", attempting to construct from base class.`); - cls = PreTrainedTokenizer; - } - return new cls(tokenizerJSON, tokenizerConfig); + // Base case: + PreTrainedTokenizer, + }; + + /** + * Instantiate one of the tokenizer classes of the library from a pretrained model. + * + * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`. + * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. + * + * @returns {Promise} A new instance of the PreTrainedTokenizer class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + quantized = true, + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = "main", + legacy = null, + } = {}, + ) { + const [tokenizerJSON, tokenizerConfig] = await loadTokenizer( + pretrained_model_name_or_path, + { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + legacy, + }, + ); + + // Some tokenizers are saved with the "Fast" suffix, so we remove that if present. + const tokenizerName = + tokenizerConfig.tokenizer_class?.replace(/Fast$/, "") ?? 
+ "PreTrainedTokenizer"; + + let cls = this.TOKENIZER_CLASS_MAPPING[tokenizerName]; + if (!cls) { + console.warn( + `Unknown tokenizer class "${tokenizerName}", attempting to construct from base class.`, + ); + cls = PreTrainedTokenizer; } + return new cls(tokenizerJSON, tokenizerConfig); + } } diff --git a/core/vendor/modules/@xenova/transformers/src/transformers.js b/core/vendor/modules/@xenova/transformers/src/transformers.js index acce33784..14a77700c 100644 --- a/core/vendor/modules/@xenova/transformers/src/transformers.js +++ b/core/vendor/modules/@xenova/transformers/src/transformers.js @@ -1,18 +1,18 @@ /** * @file Entry point for the Transformers.js library. Only the exports from this file * are available to the end user, and are grouped as follows: - * + * * 1. [Pipelines](./pipelines) * 2. [Environment variables](./env) * 3. [Models](./models) * 4. [Tokenizers](./tokenizers) * 5. [Processors](./processors) - * + * * @module transformers */ -export * from './env.js'; -export * from './pipelines.js'; +export * from "./env.js"; +export * from "./pipelines.js"; // export * from './models.js'; // export * from './tokenizers.js'; // export * from './processors.js'; diff --git a/core/vendor/modules/@xenova/transformers/src/utils/audio.js b/core/vendor/modules/@xenova/transformers/src/utils/audio.js index 082870de8..5e42a60ea 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/audio.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/audio.js @@ -1,20 +1,15 @@ /** - * @file Helper module for audio processing. - * - * These functions and classes are only used internally, + * @file Helper module for audio processing. + * + * These functions and classes are only used internally, * meaning an end-user shouldn't need to access anything here. 
- * + * * @module utils/audio */ -import { - getFile, -} from './hub.js'; -import { FFT, max } from './maths.js'; -import { - calculateReflectOffset, -} from './core.js'; - +import { getFile } from "./hub.js"; +import { FFT, max } from "./maths.js"; +import { calculateReflectOffset } from "./core.js"; /** * Helper function to read audio from a path/URL. @@ -23,58 +18,59 @@ import { * @returns {Promise} The decoded audio as a `Float32Array`. */ export async function read_audio(url, sampling_rate) { - if (typeof AudioContext === 'undefined') { - // Running in node or an environment without AudioContext - throw Error( - "Unable to load audio from path/URL since `AudioContext` is not available in your environment. " + - "Instead, audio data should be passed directly to the pipeline/processor. " + - "For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing." - ) + if (typeof AudioContext === "undefined") { + // Running in node or an environment without AudioContext + throw Error( + "Unable to load audio from path/URL since `AudioContext` is not available in your environment. " + + "Instead, audio data should be passed directly to the pipeline/processor. 
" + + "For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing.", + ); + } + + const response = await (await getFile(url)).arrayBuffer(); + const audioCTX = new AudioContext({ sampleRate: sampling_rate }); + if (typeof sampling_rate === "undefined") { + console.warn( + `No sampling rate provided, using default of ${audioCTX.sampleRate}Hz.`, + ); + } + const decoded = await audioCTX.decodeAudioData(response); + + /** @type {Float32Array} */ + let audio; + + // We now replicate HuggingFace's `ffmpeg_read` method: + if (decoded.numberOfChannels === 2) { + // When downmixing a stereo audio file to mono using the -ac 1 option in FFmpeg, + // the audio signal is summed across both channels to create a single mono channel. + // However, if the audio is at full scale (i.e. the highest possible volume level), + // the summing of the two channels can cause the audio signal to clip or distort. + + // To prevent this clipping, FFmpeg applies a scaling factor of 1/sqrt(2) (~ 0.707) + // to the audio signal before summing the two channels. This scaling factor ensures + // that the combined audio signal will not exceed the maximum possible level, even + // if both channels are at full scale. + + // After applying this scaling factor, the audio signal from both channels is summed + // to create a single mono channel. It's worth noting that this scaling factor is + // only applied when downmixing stereo audio to mono using the -ac 1 option in FFmpeg. + // If you're using a different downmixing method, or if you're not downmixing the + // audio at all, this scaling factor may not be needed. 
+ const SCALING_FACTOR = Math.sqrt(2); + + const left = decoded.getChannelData(0); + const right = decoded.getChannelData(1); + + audio = new Float32Array(left.length); + for (let i = 0; i < decoded.length; ++i) { + audio[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2; } + } else { + // If the audio is not stereo, we can just use the first channel: + audio = decoded.getChannelData(0); + } - const response = await (await getFile(url)).arrayBuffer(); - const audioCTX = new AudioContext({ sampleRate: sampling_rate }); - if (typeof sampling_rate === 'undefined') { - console.warn(`No sampling rate provided, using default of ${audioCTX.sampleRate}Hz.`) - } - const decoded = await audioCTX.decodeAudioData(response); - - /** @type {Float32Array} */ - let audio; - - // We now replicate HuggingFace's `ffmpeg_read` method: - if (decoded.numberOfChannels === 2) { - // When downmixing a stereo audio file to mono using the -ac 1 option in FFmpeg, - // the audio signal is summed across both channels to create a single mono channel. - // However, if the audio is at full scale (i.e. the highest possible volume level), - // the summing of the two channels can cause the audio signal to clip or distort. - - // To prevent this clipping, FFmpeg applies a scaling factor of 1/sqrt(2) (~ 0.707) - // to the audio signal before summing the two channels. This scaling factor ensures - // that the combined audio signal will not exceed the maximum possible level, even - // if both channels are at full scale. - - // After applying this scaling factor, the audio signal from both channels is summed - // to create a single mono channel. It's worth noting that this scaling factor is - // only applied when downmixing stereo audio to mono using the -ac 1 option in FFmpeg. - // If you're using a different downmixing method, or if you're not downmixing the - // audio at all, this scaling factor may not be needed. 
- const SCALING_FACTOR = Math.sqrt(2); - - const left = decoded.getChannelData(0); - const right = decoded.getChannelData(1); - - audio = new Float32Array(left.length); - for (let i = 0; i < decoded.length; ++i) { - audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2; - } - - } else { - // If the audio is not stereo, we can just use the first channel: - audio = decoded.getChannelData(0); - } - - return audio; + return audio; } /** @@ -84,107 +80,127 @@ export async function read_audio(url, sampling_rate) { * @returns {Float64Array} The generated Hanning window. */ export function hanning(M) { - if (M < 1) { - return new Float64Array(); - } - if (M === 1) { - return new Float64Array([1]); - } - const denom = M - 1; - const factor = Math.PI / denom; - const cos_vals = new Float64Array(M); - for (let i = 0; i < M; ++i) { - const n = 2 * i - denom; - cos_vals[i] = 0.5 + 0.5 * Math.cos(factor * n); - } - return cos_vals; + if (M < 1) { + return new Float64Array(); + } + if (M === 1) { + return new Float64Array([1]); + } + const denom = M - 1; + const factor = Math.PI / denom; + const cos_vals = new Float64Array(M); + for (let i = 0; i < M; ++i) { + const n = 2 * i - denom; + cos_vals[i] = 0.5 + 0.5 * Math.cos(factor * n); + } + return cos_vals; } const HERTZ_TO_MEL_MAPPING = { - "htk": (/** @type {number} */ freq) => 2595.0 * Math.log10(1.0 + (freq / 700.0)), - "kaldi": (/** @type {number} */ freq) => 1127.0 * Math.log(1.0 + (freq / 700.0)), - "slaney": (/** @type {number} */ freq, min_log_hertz = 1000.0, min_log_mel = 15.0, logstep = 27.0 / Math.log(6.4)) => - freq >= min_log_hertz - ? 
min_log_mel + Math.log(freq / min_log_hertz) * logstep - : 3.0 * freq / 200.0, -} + htk: (/** @type {number} */ freq) => 2595.0 * Math.log10(1.0 + freq / 700.0), + kaldi: (/** @type {number} */ freq) => 1127.0 * Math.log(1.0 + freq / 700.0), + slaney: ( + /** @type {number} */ freq, + min_log_hertz = 1000.0, + min_log_mel = 15.0, + logstep = 27.0 / Math.log(6.4), + ) => + freq >= min_log_hertz + ? min_log_mel + Math.log(freq / min_log_hertz) * logstep + : (3.0 * freq) / 200.0, +}; /** - * @template {Float32Array|Float64Array|number} T - * @param {T} freq + * @template {Float32Array|Float64Array|number} T + * @param {T} freq * @param {string} [mel_scale] * @returns {T} */ function hertz_to_mel(freq, mel_scale = "htk") { - const fn = HERTZ_TO_MEL_MAPPING[mel_scale]; - if (!fn) { - throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".'); - } + const fn = HERTZ_TO_MEL_MAPPING[mel_scale]; + if (!fn) { + throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".'); + } - return typeof freq === 'number' ? fn(freq) : freq.map(x => fn(x)); + return typeof freq === "number" ? fn(freq) : freq.map((x) => fn(x)); } const MEL_TO_HERTZ_MAPPING = { - "htk": (/** @type {number} */ mels) => 700.0 * (10.0 ** (mels / 2595.0) - 1.0), - "kaldi": (/** @type {number} */ mels) => 700.0 * (Math.exp(mels / 1127.0) - 1.0), - "slaney": (/** @type {number} */ mels, min_log_hertz = 1000.0, min_log_mel = 15.0, logstep = Math.log(6.4) / 27.0) => mels >= min_log_mel - ? min_log_hertz * Math.exp(logstep * (mels - min_log_mel)) - : 200.0 * mels / 3.0, -} + htk: (/** @type {number} */ mels) => 700.0 * (10.0 ** (mels / 2595.0) - 1.0), + kaldi: (/** @type {number} */ mels) => + 700.0 * (Math.exp(mels / 1127.0) - 1.0), + slaney: ( + /** @type {number} */ mels, + min_log_hertz = 1000.0, + min_log_mel = 15.0, + logstep = Math.log(6.4) / 27.0, + ) => + mels >= min_log_mel + ? 
min_log_hertz * Math.exp(logstep * (mels - min_log_mel)) + : (200.0 * mels) / 3.0, +}; /** - * @template {Float32Array|Float64Array|number} T - * @param {T} mels + * @template {Float32Array|Float64Array|number} T + * @param {T} mels * @param {string} [mel_scale] * @returns {T} */ function mel_to_hertz(mels, mel_scale = "htk") { - const fn = MEL_TO_HERTZ_MAPPING[mel_scale]; - if (!fn) { - throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".'); - } + const fn = MEL_TO_HERTZ_MAPPING[mel_scale]; + if (!fn) { + throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".'); + } - return typeof mels === 'number' ? fn(mels) : mels.map(x => fn(x)); + return typeof mels === "number" ? fn(mels) : mels.map((x) => fn(x)); } /** -* Creates a triangular filter bank. -* -* Adapted from torchaudio and librosa. -* -* @param {Float64Array} fft_freqs Discrete frequencies of the FFT bins in Hz, of shape `(num_frequency_bins,)`. -* @param {Float64Array} filter_freqs Center frequencies of the triangular filters to create, in Hz, of shape `(num_mel_filters,)`. -* @returns {number[][]} of shape `(num_frequency_bins, num_mel_filters)`. -*/ + * Creates a triangular filter bank. + * + * Adapted from torchaudio and librosa. + * + * @param {Float64Array} fft_freqs Discrete frequencies of the FFT bins in Hz, of shape `(num_frequency_bins,)`. + * @param {Float64Array} filter_freqs Center frequencies of the triangular filters to create, in Hz, of shape `(num_mel_filters,)`. + * @returns {number[][]} of shape `(num_frequency_bins, num_mel_filters)`. 
+ */ function _create_triangular_filter_bank(fft_freqs, filter_freqs) { - const filter_diff = Float64Array.from( - { length: filter_freqs.length - 1 }, - (_, i) => filter_freqs[i + 1] - filter_freqs[i] - ); + const filter_diff = Float64Array.from( + { length: filter_freqs.length - 1 }, + (_, i) => filter_freqs[i + 1] - filter_freqs[i], + ); - const slopes = Array.from({ - length: fft_freqs.length - }, () => new Array(filter_freqs.length)); + const slopes = Array.from( + { + length: fft_freqs.length, + }, + () => new Array(filter_freqs.length), + ); - for (let j = 0; j < fft_freqs.length; ++j) { - const slope = slopes[j]; - for (let i = 0; i < filter_freqs.length; ++i) { - slope[i] = filter_freqs[i] - fft_freqs[j]; - } + for (let j = 0; j < fft_freqs.length; ++j) { + const slope = slopes[j]; + for (let i = 0; i < filter_freqs.length; ++i) { + slope[i] = filter_freqs[i] - fft_freqs[j]; } + } - const numFreqs = filter_freqs.length - 2; - const ret = Array.from({ length: numFreqs }, () => new Array(fft_freqs.length)); + const numFreqs = filter_freqs.length - 2; + const ret = Array.from( + { length: numFreqs }, + () => new Array(fft_freqs.length), + ); - for (let j = 0; j < fft_freqs.length; ++j) { // 201 - const slope = slopes[j]; - for (let i = 0; i < numFreqs; ++i) { // 80 - const down = -slope[i] / filter_diff[i]; - const up = slope[i + 2] / filter_diff[i + 1]; - ret[i][j] = Math.max(0, Math.min(down, up)); - } + for (let j = 0; j < fft_freqs.length; ++j) { + // 201 + const slope = slopes[j]; + for (let i = 0; i < numFreqs; ++i) { + // 80 + const down = -slope[i] / filter_diff[i]; + const up = slope[i + 2] / filter_diff[i + 1]; + ret[i][j] = Math.max(0, Math.min(down, up)); } - return ret; + } + return ret; } /** @@ -195,8 +211,8 @@ function _create_triangular_filter_bank(fft_freqs, filter_freqs) { * @returns `num` evenly spaced samples, calculated over the interval `[start, stop]`. 
*/ function linspace(start, end, num) { - const step = (end - start) / (num - 1); - return Float64Array.from({ length: num }, (_, i) => start + step * i); + const step = (end - start) / (num - 1); + return Float64Array.from({ length: num }, (_, i) => start + step * i); } /** @@ -217,52 +233,57 @@ function linspace(start, end, num) { * This is a projection matrix to go from a spectrogram to a mel spectrogram. */ export function mel_filter_bank( - num_frequency_bins, - num_mel_filters, - min_frequency, - max_frequency, - sampling_rate, - norm = null, - mel_scale = "htk", - triangularize_in_mel_space = false, + num_frequency_bins, + num_mel_filters, + min_frequency, + max_frequency, + sampling_rate, + norm = null, + mel_scale = "htk", + triangularize_in_mel_space = false, ) { - if (norm !== null && norm !== "slaney") { - throw new Error('norm must be one of null or "slaney"'); + if (norm !== null && norm !== "slaney") { + throw new Error('norm must be one of null or "slaney"'); + } + + const mel_min = hertz_to_mel(min_frequency, mel_scale); + const mel_max = hertz_to_mel(max_frequency, mel_scale); + const mel_freqs = linspace(mel_min, mel_max, num_mel_filters + 2); + + let filter_freqs = mel_to_hertz(mel_freqs, mel_scale); + let fft_freqs; // frequencies of FFT bins in Hz + + if (triangularize_in_mel_space) { + const fft_bin_width = sampling_rate / (num_frequency_bins * 2); + fft_freqs = hertz_to_mel( + Float64Array.from( + { length: num_frequency_bins }, + (_, i) => i * fft_bin_width, + ), + mel_scale, + ); + filter_freqs = mel_freqs; + } else { + fft_freqs = linspace(0, Math.floor(sampling_rate / 2), num_frequency_bins); + } + + const mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs); + + if (norm !== null && norm === "slaney") { + // Slaney-style mel is scaled to be approx constant energy per channel + for (let i = 0; i < num_mel_filters; ++i) { + const filter = mel_filters[i]; + const enorm = 2.0 / (filter_freqs[i + 2] - filter_freqs[i]); + for 
(let j = 0; j < num_frequency_bins; ++j) { + // Apply this enorm to all frequency bins + filter[j] *= enorm; + } } + } - const mel_min = hertz_to_mel(min_frequency, mel_scale); - const mel_max = hertz_to_mel(max_frequency, mel_scale); - const mel_freqs = linspace(mel_min, mel_max, num_mel_filters + 2); - - let filter_freqs = mel_to_hertz(mel_freqs, mel_scale); - let fft_freqs; // frequencies of FFT bins in Hz - - if (triangularize_in_mel_space) { - const fft_bin_width = sampling_rate / (num_frequency_bins * 2); - fft_freqs = hertz_to_mel(Float64Array.from({ length: num_frequency_bins }, (_, i) => i * fft_bin_width), mel_scale); - filter_freqs = mel_freqs; - } else { - fft_freqs = linspace(0, Math.floor(sampling_rate / 2), num_frequency_bins); - } - - const mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs); - - if (norm !== null && norm === "slaney") { - // Slaney-style mel is scaled to be approx constant energy per channel - for (let i = 0; i < num_mel_filters; ++i) { - const filter = mel_filters[i]; - const enorm = 2.0 / (filter_freqs[i + 2] - filter_freqs[i]); - for (let j = 0; j < num_frequency_bins; ++j) { - // Apply this enorm to all frequency bins - filter[j] *= enorm; - } - } - } - - // TODO warn if there is a zero row - - return mel_filters; + // TODO warn if there is a zero row + return mel_filters; } /** @@ -274,73 +295,80 @@ export function mel_filter_bank( * @returns {T} The padded array. 
*/ function padReflect(array, left, right) { - // @ts-ignore - const padded = new array.constructor(array.length + left + right); - const w = array.length - 1; + // @ts-ignore + const padded = new array.constructor(array.length + left + right); + const w = array.length - 1; - for (let i = 0; i < array.length; ++i) { - padded[left + i] = array[i]; - } + for (let i = 0; i < array.length; ++i) { + padded[left + i] = array[i]; + } - for (let i = 1; i <= left; ++i) { - padded[left - i] = array[calculateReflectOffset(i, w)]; - } + for (let i = 1; i <= left; ++i) { + padded[left - i] = array[calculateReflectOffset(i, w)]; + } - for (let i = 1; i <= right; ++i) { - padded[w + left + i] = array[calculateReflectOffset(w - i, w)]; - } + for (let i = 1; i <= right; ++i) { + padded[w + left + i] = array[calculateReflectOffset(w - i, w)]; + } - return padded; + return padded; } /** * Helper function to compute `amplitude_to_db` and `power_to_db`. * @template {Float32Array|Float64Array} T - * @param {T} spectrogram - * @param {number} factor - * @param {number} reference - * @param {number} min_value - * @param {number} db_range + * @param {T} spectrogram + * @param {number} factor + * @param {number} reference + * @param {number} min_value + * @param {number} db_range * @returns {T} */ -function _db_conversion_helper(spectrogram, factor, reference, min_value, db_range) { - if (reference <= 0) { - throw new Error('reference must be greater than zero'); +function _db_conversion_helper( + spectrogram, + factor, + reference, + min_value, + db_range, +) { + if (reference <= 0) { + throw new Error("reference must be greater than zero"); + } + + if (min_value <= 0) { + throw new Error("min_value must be greater than zero"); + } + + reference = Math.max(min_value, reference); + + const logReference = Math.log10(reference); + for (let i = 0; i < spectrogram.length; ++i) { + spectrogram[i] = + factor * Math.log10(Math.max(min_value, spectrogram[i]) - logReference); + } + + if (db_range 
!== null) { + if (db_range <= 0) { + throw new Error("db_range must be greater than zero"); } - - if (min_value <= 0) { - throw new Error('min_value must be greater than zero'); - } - - reference = Math.max(min_value, reference); - - const logReference = Math.log10(reference); + const maxValue = max(spectrogram)[0] - db_range; for (let i = 0; i < spectrogram.length; ++i) { - spectrogram[i] = factor * Math.log10(Math.max(min_value, spectrogram[i]) - logReference) + spectrogram[i] = Math.max(spectrogram[i], maxValue); } + } - if (db_range !== null) { - if (db_range <= 0) { - throw new Error('db_range must be greater than zero'); - } - const maxValue = max(spectrogram)[0] - db_range; - for (let i = 0; i < spectrogram.length; ++i) { - spectrogram[i] = Math.max(spectrogram[i], maxValue); - } - } - - return spectrogram; + return spectrogram; } /** * Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, * using basic logarithm properties for numerical stability. NOTE: Operates in-place. - * + * * The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a * linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. * This means that large variations in energy may not sound all that different if the sound is loud to begin with. * This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. - * + * * @template {Float32Array|Float64Array} T * @param {T} spectrogram The input amplitude (mel) spectrogram. * @param {number} [reference=1.0] Sets the input spectrogram value that corresponds to 0 dB. @@ -351,21 +379,32 @@ function _db_conversion_helper(spectrogram, factor, reference, min_value, db_ran * difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero. * @returns {T} The modified spectrogram in decibels. 
*/ -function amplitude_to_db(spectrogram, reference = 1.0, min_value = 1e-5, db_range = null) { - return _db_conversion_helper(spectrogram, 20.0, reference, min_value, db_range); +function amplitude_to_db( + spectrogram, + reference = 1.0, + min_value = 1e-5, + db_range = null, +) { + return _db_conversion_helper( + spectrogram, + 20.0, + reference, + min_value, + db_range, + ); } /** * Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, * using basic logarithm properties for numerical stability. NOTE: Operates in-place. - * + * * The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a * linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. * This means that large variations in energy may not sound all that different if the sound is loud to begin with. * This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. - * + * * Based on the implementation of `librosa.power_to_db`. - * + * * @template {Float32Array|Float64Array} T * @param {T} spectrogram The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! * @param {number} [reference=1.0] Sets the input spectrogram value that corresponds to 0 dB. @@ -376,13 +415,24 @@ function amplitude_to_db(spectrogram, reference = 1.0, min_value = 1e-5, db_rang * difference between the peak value and the smallest value will never be more than 80 dB. Must be greater than zero. * @returns {T} The modified spectrogram in decibels. 
*/ -function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range = null) { - return _db_conversion_helper(spectrogram, 10.0, reference, min_value, db_range); +function power_to_db( + spectrogram, + reference = 1.0, + min_value = 1e-10, + db_range = null, +) { + return _db_conversion_helper( + spectrogram, + 10.0, + reference, + min_value, + db_range, + ); } /** * Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. - * + * * This function can create the following kinds of spectrograms: * - amplitude spectrogram (`power = 1.0`) * - power spectrogram (`power = 2.0`) @@ -392,9 +442,9 @@ function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range = * - log-mel spectrogram (provide `mel_filters` and `log_mel`) * * In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. - * A padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + * A padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, * typically the next power of two. - * + * * @param {Float32Array|Float64Array} waveform The input waveform of shape `(length,)`. This must be a single real-valued, mono waveform. * @param {Float32Array|Float64Array} window The windowing function to apply of shape `(frame_length,)`, including zero-padding if necessary. The actual window length may be * shorter than `frame_length`, but we're assuming the array has already been zero-padded. @@ -432,193 +482,209 @@ function power_to_db(spectrogram, reference = 1.0, min_value = 1e-10, db_range = * @returns {{data: Float32Array, dims: number[]}} Spectrogram of shape `(num_frequency_bins, length)` (regular spectrogram) or shape `(num_mel_filters, length)` (mel spectrogram). 
*/ export function spectrogram( - waveform, - window, - frame_length, - hop_length, - { - fft_length = null, - power = 1.0, - center = true, - pad_mode = "reflect", - onesided = true, - preemphasis = null, - mel_filters = null, - mel_floor = 1e-10, - log_mel = null, - reference = 1.0, - min_value = 1e-10, - db_range = null, - remove_dc_offset = null, + waveform, + window, + frame_length, + hop_length, + { + fft_length = null, + power = 1.0, + center = true, + pad_mode = "reflect", + onesided = true, + preemphasis = null, + mel_filters = null, + mel_floor = 1e-10, + log_mel = null, + reference = 1.0, + min_value = 1e-10, + db_range = null, + remove_dc_offset = null, - // Custom parameters for efficiency reasons - max_num_frames = null, - do_pad = true, - transpose = false, - } = {} + // Custom parameters for efficiency reasons + max_num_frames = null, + do_pad = true, + transpose = false, + } = {}, ) { - const window_length = window.length; - if (fft_length === null) { - fft_length = frame_length; + const window_length = window.length; + if (fft_length === null) { + fft_length = frame_length; + } + if (frame_length > fft_length) { + throw Error( + `frame_length (${frame_length}) may not be larger than fft_length (${fft_length})`, + ); + } + + if (window_length !== frame_length) { + throw new Error( + `Length of the window (${window_length}) must equal frame_length (${frame_length})`, + ); + } + + if (hop_length <= 0) { + throw new Error("hop_length must be greater than zero"); + } + + if (center) { + if (pad_mode !== "reflect") { + throw new Error(`pad_mode="${pad_mode}" not implemented yet.`); } - if (frame_length > fft_length) { - throw Error(`frame_length (${frame_length}) may not be larger than fft_length (${fft_length})`) + const half_window = Math.floor((fft_length - 1) / 2) + 1; + waveform = padReflect(waveform, half_window, half_window); + } + + // split waveform into frames of frame_length size + const num_frames = Math.floor( + 1 + 
Math.floor((waveform.length - frame_length) / hop_length), + ); + + const num_frequency_bins = onesided + ? Math.floor(fft_length / 2) + 1 + : fft_length; + + let d1 = num_frames; + let d1Max = num_frames; + + // If maximum number of frames is provided, we must either pad or truncate + if (max_num_frames !== null) { + if (max_num_frames > num_frames) { + // input is too short, so we pad + if (do_pad) { + d1Max = max_num_frames; + } + } else { + // input is too long, so we truncate + d1Max = d1 = max_num_frames; + } + } + + // Preallocate arrays to store output. + const fft = new FFT(fft_length); + const inputBuffer = new Float64Array(fft_length); + const outputBuffer = new Float64Array(fft.outputBufferSize); + const magnitudes = new Array(d1); + + for (let i = 0; i < d1; ++i) { + // Populate buffer with waveform data + const offset = i * hop_length; + for (let j = 0; j < frame_length; ++j) { + inputBuffer[j] = waveform[offset + j]; } - if (window_length !== frame_length) { - throw new Error(`Length of the window (${window_length}) must equal frame_length (${frame_length})`); + if (remove_dc_offset) { + let sum = 0; + for (let j = 0; j < frame_length; ++j) { + sum += inputBuffer[j]; + } + const mean = sum / frame_length; + for (let j = 0; j < frame_length; ++j) { + inputBuffer[j] -= mean; + } } - if (hop_length <= 0) { - throw new Error("hop_length must be greater than zero"); + if (preemphasis !== null) { + // Done in reverse to avoid copies and distructive modification + for (let j = frame_length - 1; j >= 1; --j) { + inputBuffer[j] -= preemphasis * inputBuffer[j - 1]; + } + inputBuffer[0] *= 1 - preemphasis; } - if (center) { - if (pad_mode !== 'reflect') { - throw new Error(`pad_mode="${pad_mode}" not implemented yet.`) - } - const half_window = Math.floor((fft_length - 1) / 2) + 1; - waveform = padReflect(waveform, half_window, half_window); + for (let j = 0; j < window.length; ++j) { + inputBuffer[j] *= window[j]; } - // split waveform into frames of 
frame_length size - const num_frames = Math.floor(1 + Math.floor((waveform.length - frame_length) / hop_length)) + fft.realTransform(outputBuffer, inputBuffer); - const num_frequency_bins = onesided ? Math.floor(fft_length / 2) + 1 : fft_length - - let d1 = num_frames; - let d1Max = num_frames; - - // If maximum number of frames is provided, we must either pad or truncate - if (max_num_frames !== null) { - if (max_num_frames > num_frames) { // input is too short, so we pad - if (do_pad) { - d1Max = max_num_frames; - } - } else { // input is too long, so we truncate - d1Max = d1 = max_num_frames; - } + // compute magnitudes + const row = new Array(num_frequency_bins); + for (let j = 0; j < row.length; ++j) { + const j2 = j << 1; + row[j] = outputBuffer[j2] ** 2 + outputBuffer[j2 + 1] ** 2; } + magnitudes[i] = row; + } - // Preallocate arrays to store output. - const fft = new FFT(fft_length); - const inputBuffer = new Float64Array(fft_length); - const outputBuffer = new Float64Array(fft.outputBufferSize); - const magnitudes = new Array(d1); - - for (let i = 0; i < d1; ++i) { - // Populate buffer with waveform data - const offset = i * hop_length; - for (let j = 0; j < frame_length; ++j) { - inputBuffer[j] = waveform[offset + j]; - } - - if (remove_dc_offset) { - let sum = 0; - for (let j = 0; j < frame_length; ++j) { - sum += inputBuffer[j]; - } - const mean = sum / frame_length; - for (let j = 0; j < frame_length; ++j) { - inputBuffer[j] -= mean; - } - } - - if (preemphasis !== null) { - // Done in reverse to avoid copies and distructive modification - for (let j = frame_length - 1; j >= 1; --j) { - inputBuffer[j] -= preemphasis * inputBuffer[j - 1]; - } - inputBuffer[0] *= 1 - preemphasis; - } - - for (let j = 0; j < window.length; ++j) { - inputBuffer[j] *= window[j]; - } - - fft.realTransform(outputBuffer, inputBuffer); - - // compute magnitudes - const row = new Array(num_frequency_bins); - for (let j = 0; j < row.length; ++j) { - const j2 = j << 1; - row[j] = 
outputBuffer[j2] ** 2 + outputBuffer[j2 + 1] ** 2; - } - magnitudes[i] = row; + // TODO what should happen if power is None? + // https://github.com/huggingface/transformers/issues/27772 + if (power !== null && power !== 2) { + // slight optimization to not sqrt + const pow = 2 / power; // we use 2 since we already squared + for (let i = 0; i < magnitudes.length; ++i) { + const magnitude = magnitudes[i]; + for (let j = 0; j < magnitude.length; ++j) { + magnitude[j] **= pow; + } } + } - // TODO what should happen if power is None? - // https://github.com/huggingface/transformers/issues/27772 - if (power !== null && power !== 2) { - // slight optimization to not sqrt - const pow = 2 / power; // we use 2 since we already squared - for (let i = 0; i < magnitudes.length; ++i) { - const magnitude = magnitudes[i]; - for (let j = 0; j < magnitude.length; ++j) { - magnitude[j] **= pow; - } - } + // TODO: What if `mel_filters` is null? + const num_mel_filters = mel_filters.length; + + // Only here do we create Float32Array + const mel_spec = new Float32Array(num_mel_filters * d1Max); + + // Perform matrix muliplication: + // mel_spec = mel_filters @ magnitudes.T + // - mel_filters.shape=(80, 201) + // - magnitudes.shape=(3000, 201) => - magnitudes.T.shape=(201, 3000) + // - mel_spec.shape=(80, 3000) + const dims = transpose ? [d1Max, num_mel_filters] : [num_mel_filters, d1Max]; + for (let i = 0; i < num_mel_filters; ++i) { + // num melfilters (e.g., 80) + const filter = mel_filters[i]; + for (let j = 0; j < d1; ++j) { + // num frames (e.g., 3000) + const magnitude = magnitudes[j]; + + let sum = 0; + for (let k = 0; k < num_frequency_bins; ++k) { + // num frequency bins (e.g., 201) + sum += filter[k] * magnitude[k]; + } + + mel_spec[transpose ? j * num_mel_filters + i : i * d1 + j] = Math.max( + mel_floor, + sum, + ); } + } - // TODO: What if `mel_filters` is null? 
- const num_mel_filters = mel_filters.length; - - // Only here do we create Float32Array - const mel_spec = new Float32Array(num_mel_filters * d1Max); - - // Perform matrix muliplication: - // mel_spec = mel_filters @ magnitudes.T - // - mel_filters.shape=(80, 201) - // - magnitudes.shape=(3000, 201) => - magnitudes.T.shape=(201, 3000) - // - mel_spec.shape=(80, 3000) - const dims = transpose ? [d1Max, num_mel_filters] : [num_mel_filters, d1Max]; - for (let i = 0; i < num_mel_filters; ++i) { // num melfilters (e.g., 80) - const filter = mel_filters[i]; - for (let j = 0; j < d1; ++j) { // num frames (e.g., 3000) - const magnitude = magnitudes[j]; - - let sum = 0; - for (let k = 0; k < num_frequency_bins; ++k) { // num frequency bins (e.g., 201) - sum += filter[k] * magnitude[k]; - } - - mel_spec[ - transpose - ? j * num_mel_filters + i - : i * d1 + j - ] = Math.max(mel_floor, sum); + if (power !== null && log_mel !== null) { + const o = Math.min(mel_spec.length, d1 * num_mel_filters); + switch (log_mel) { + case "log": + for (let i = 0; i < o; ++i) { + mel_spec[i] = Math.log(mel_spec[i]); } - } - - if (power !== null && log_mel !== null) { - const o = Math.min(mel_spec.length, d1 * num_mel_filters); - switch (log_mel) { - case 'log': - for (let i = 0; i < o; ++i) { - mel_spec[i] = Math.log(mel_spec[i]); - } - break; - case 'log10': - for (let i = 0; i < o; ++i) { - mel_spec[i] = Math.log10(mel_spec[i]); - } - break; - case 'dB': - if (power === 1.0) { - // NOTE: operates in-place - amplitude_to_db(mel_spec, reference, min_value, db_range); - } else if (power === 2.0) { - power_to_db(mel_spec, reference, min_value, db_range); - } else { - throw new Error(`Cannot use log_mel option '${log_mel}' with power ${power}`) - } - break; - default: - throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. 
Got '${log_mel}'`); + break; + case "log10": + for (let i = 0; i < o; ++i) { + mel_spec[i] = Math.log10(mel_spec[i]); } + break; + case "dB": + if (power === 1.0) { + // NOTE: operates in-place + amplitude_to_db(mel_spec, reference, min_value, db_range); + } else if (power === 2.0) { + power_to_db(mel_spec, reference, min_value, db_range); + } else { + throw new Error( + `Cannot use log_mel option '${log_mel}' with power ${power}`, + ); + } + break; + default: + throw new Error( + `log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`, + ); } + } - return { data: mel_spec, dims }; + return { data: mel_spec, dims }; } /** @@ -632,33 +698,35 @@ export function spectrogram( * @param {boolean} [options.center=true] Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. * @returns {Float64Array} The window of shape `(window_length,)` or `(frame_length,)`. */ -export function window_function(window_length, name, { - periodic = true, - frame_length = null, - center = true, -} = {}) { - const length = periodic ? window_length + 1 : window_length; - let window; - switch (name) { - case 'boxcar': - window = new Float64Array(length).fill(1.0); - break; - case 'hann': - case 'hann_window': - window = hanning(length); - break; - default: - throw new Error(`Unknown window type ${name}.`); - } - if (periodic) { - window = window.subarray(0, window_length); - } - if (frame_length === null) { - return window; - } - if (window_length > frame_length) { - throw new Error(`Length of the window (${window_length}) may not be larger than frame_length (${frame_length})`); - } - +export function window_function( + window_length, + name, + { periodic = true, frame_length = null, center = true } = {}, +) { + const length = periodic ? 
window_length + 1 : window_length; + let window; + switch (name) { + case "boxcar": + window = new Float64Array(length).fill(1.0); + break; + case "hann": + case "hann_window": + window = hanning(length); + break; + default: + throw new Error(`Unknown window type ${name}.`); + } + if (periodic) { + window = window.subarray(0, window_length); + } + if (frame_length === null) { return window; + } + if (window_length > frame_length) { + throw new Error( + `Length of the window (${window_length}) may not be larger than frame_length (${frame_length})`, + ); + } + + return window; } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/core.js b/core/vendor/modules/@xenova/transformers/src/utils/core.js index 4ed0f15ef..69685a836 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/core.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/core.js @@ -1,10 +1,9 @@ - /** * @file Core utility functions/classes for Transformers.js. - * + * * These are only used internally, meaning an end-user shouldn't * need to access anything here. - * + * * @module utils/core */ @@ -17,7 +16,7 @@ * @private */ export function dispatchCallback(progress_callback, data) { - if (progress_callback) progress_callback(data); + if (progress_callback) progress_callback(data); } /** @@ -28,8 +27,10 @@ export function dispatchCallback(progress_callback, data) { * @see https://ultimatecourses.com/blog/reverse-object-keys-and-values-in-javascript */ export function reverseDictionary(data) { - // https://ultimatecourses.com/blog/reverse-object-keys-and-values-in-javascript - return Object.fromEntries(Object.entries(data).map(([key, value]) => [value, key])); + // https://ultimatecourses.com/blog/reverse-object-keys-and-values-in-javascript + return Object.fromEntries( + Object.entries(data).map(([key, value]) => [value, key]), + ); } /** @@ -39,29 +40,30 @@ export function reverseDictionary(data) { * @returns {string} The escaped string. 
*/ export function escapeRegExp(string) { - return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string + return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string } /** * A base class for creating callable objects. - * + * * @type {new () => {(...args: any[]): any, _call(...args: any[]): any}} */ -export const Callable = /** @type {any} */ (class { +export const Callable = /** @type {any} */ ( + class { /** - * Creates a new instance of the Callable class. - */ + * Creates a new instance of the Callable class. + */ constructor() { - /** - * Creates a closure that delegates to a private method '_call' with the given arguments. - * @type {any} - * @param {...any} args Zero or more arguments to pass to the '_call' method. - * @returns {*} The result of calling the '_call' method. - */ - let closure = function (...args) { - return closure._call(...args) - } - return Object.setPrototypeOf(closure, new.target.prototype) + /** + * Creates a closure that delegates to a private method '_call' with the given arguments. + * @type {any} + * @param {...any} args Zero or more arguments to pass to the '_call' method. + * @returns {*} The result of calling the '_call' method. + */ + let closure = function (...args) { + return closure._call(...args); + }; + return Object.setPrototypeOf(closure, new.target.prototype); } /** @@ -72,29 +74,29 @@ export const Callable = /** @type {any} */ (class { * @throws {Error} If the subclass does not implement the `_call` method. */ _call(...args) { - throw Error('Must implement _call method in subclass') + throw Error("Must implement _call method in subclass"); } -}); + } +); /** * Check if a value is a typed array. * @param {*} val The value to check. * @returns {boolean} True if the value is a `TypedArray`, false otherwise. 
- * + * * Adapted from https://stackoverflow.com/a/71091338/13989043 */ export function isTypedArray(val) { - return val?.prototype?.__proto__?.constructor?.name === 'TypedArray'; + return val?.prototype?.__proto__?.constructor?.name === "TypedArray"; } - /** * Check if a value is an integer. * @param {*} x The value to check. * @returns {boolean} True if the value is a string, false otherwise. */ export function isIntegralNumber(x) { - return Number.isInteger(x) || typeof x === 'bigint' + return Number.isInteger(x) || typeof x === "bigint"; } /** @@ -103,7 +105,7 @@ export function isIntegralNumber(x) { * @returns {boolean} True if the value exists, false otherwise. */ export function exists(x) { - return x !== undefined && x !== null; + return x !== undefined && x !== null; } /** @@ -113,13 +115,13 @@ export function exists(x) { * @returns {number[]} An array containing the dimensions of the input array. */ export function calculateDimensions(arr) { - const dimensions = []; - let current = arr; - while (Array.isArray(current)) { - dimensions.push(current.length); - current = current[0]; - } - return dimensions; + const dimensions = []; + let current = arr; + while (Array.isArray(current)) { + dimensions.push(current.length); + current = current[0]; + } + return dimensions; } /** @@ -131,15 +133,15 @@ export function calculateDimensions(arr) { * @throws {Error} If the key does not exist and no default value is provided. 
*/ export function pop(obj, key, defaultValue = undefined) { - const value = obj[key]; - if (value !== undefined) { - delete obj[key]; - return value; - } - if (defaultValue === undefined) { - throw Error(`Key ${key} does not exist in object.`) - } - return defaultValue; + const value = obj[key]; + if (value !== undefined) { + delete obj[key]; + return value; + } + if (defaultValue === undefined) { + throw Error(`Key ${key} does not exist in object.`); + } + return defaultValue; } /** @@ -149,7 +151,7 @@ export function pop(obj, key, defaultValue = undefined) { * @returns {Array} The merged array. */ export function mergeArrays(...arrs) { - return Array.prototype.concat.apply([], arrs); + return Array.prototype.concat.apply([], arrs); } /** @@ -159,9 +161,9 @@ export function mergeArrays(...arrs) { * @private */ export function product(...a) { - // Cartesian product of items - // Adapted from https://stackoverflow.com/a/43053803 - return a.reduce((a, b) => a.flatMap(d => b.map(e => [d, e]))); + // Cartesian product of items + // Adapted from https://stackoverflow.com/a/43053803 + return a.reduce((a, b) => a.flatMap((d) => b.map((e) => [d, e]))); } /** @@ -171,5 +173,5 @@ export function product(...a) { * @returns {number} The index offset. */ export function calculateReflectOffset(i, w) { - return Math.abs((i + w) % (2 * w) - w); + return Math.abs(((i + w) % (2 * w)) - w); } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/data-structures.js b/core/vendor/modules/@xenova/transformers/src/utils/data-structures.js index dd8a78867..e032fa030 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/data-structures.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/data-structures.js @@ -1,415 +1,420 @@ - /** * @file Custom data structures. - * + * * These are only used internally, meaning an end-user shouldn't * need to access anything here. 
- * + * * @module utils/data-structures */ - /** * Efficient Heap-based Implementation of a Priority Queue. * It uses an array-based binary heap, where the root is at index `0`, and the * children of node `i` are located at indices `2i + 1` and `2i + 2`, respectively. - * + * * Adapted from the following sources: * - https://stackoverflow.com/a/42919752/13989043 (original) * - https://github.com/belladoreai/llama-tokenizer-js (minor improvements) */ export class PriorityQueue { + /** + * Create a new PriorityQueue. + * @param {Function} comparator Comparator function to determine priority. Defaults to a MaxHeap. + */ + constructor(comparator = (a, b) => a > b) { + this._heap = []; + this._comparator = comparator; + } - /** - * Create a new PriorityQueue. - * @param {Function} comparator Comparator function to determine priority. Defaults to a MaxHeap. - */ - constructor(comparator = (a, b) => a > b) { - this._heap = []; - this._comparator = comparator; - } + /** + * The size of the queue + */ + get size() { + return this._heap.length; + } - /** - * The size of the queue - */ - get size() { - return this._heap.length; - } + /** + * Check if the queue is empty. + * @returns {boolean} `true` if the queue is empty, `false` otherwise. + */ + isEmpty() { + return this.size === 0; + } - /** - * Check if the queue is empty. - * @returns {boolean} `true` if the queue is empty, `false` otherwise. - */ - isEmpty() { - return this.size === 0; - } + /** + * Return the element with the highest priority in the queue. + * @returns {any} The highest priority element in the queue. + */ + peek() { + return this._heap[0]; + } - /** - * Return the element with the highest priority in the queue. - * @returns {any} The highest priority element in the queue. - */ - peek() { - return this._heap[0]; - } + /** + * Add one or more elements to the queue. + * @param {...any} values The values to push into the queue. + * @returns {number} The new size of the queue. 
+ */ + push(...values) { + return this.extend(values); + } - /** - * Add one or more elements to the queue. - * @param {...any} values The values to push into the queue. - * @returns {number} The new size of the queue. - */ - push(...values) { - return this.extend(values); + /** + * Add multiple elements to the queue. + * @param {any[]} values The values to push into the queue. + * @returns {number} The new size of the queue. + */ + extend(values) { + for (const value of values) { + this._heap.push(value); + this._siftUp(); } + return this.size; + } - /** - * Add multiple elements to the queue. - * @param {any[]} values The values to push into the queue. - * @returns {number} The new size of the queue. - */ - extend(values) { - for (const value of values) { - this._heap.push(value); - this._siftUp(); - } - return this.size; + /** + * Remove and return the element with the highest priority in the queue. + * @returns {any} The element with the highest priority in the queue. + */ + pop() { + const poppedValue = this.peek(); + const bottom = this.size - 1; + if (bottom > 0) { + this._swap(0, bottom); } + this._heap.pop(); + this._siftDown(); + return poppedValue; + } - /** - * Remove and return the element with the highest priority in the queue. - * @returns {any} The element with the highest priority in the queue. - */ - pop() { - const poppedValue = this.peek(); - const bottom = this.size - 1; - if (bottom > 0) { - this._swap(0, bottom); - } - this._heap.pop(); - this._siftDown(); - return poppedValue; - } + /** + * Replace the element with the highest priority in the queue with a new value. + * @param {*} value The new value. + * @returns {*} The replaced value. + */ + replace(value) { + const replacedValue = this.peek(); + this._heap[0] = value; + this._siftDown(); + return replacedValue; + } - /** - * Replace the element with the highest priority in the queue with a new value. - * @param {*} value The new value. - * @returns {*} The replaced value. 
- */ - replace(value) { - const replacedValue = this.peek(); - this._heap[0] = value; - this._siftDown(); - return replacedValue; - } + /** + * Compute the index for the parent of the node at index `i`. + * @param {number} i The index of the node to get the parent of. + * @returns {number} The index of the parent node. + * @private + */ + _parent(i) { + return ((i + 1) >>> 1) - 1; + } - /** - * Compute the index for the parent of the node at index `i`. - * @param {number} i The index of the node to get the parent of. - * @returns {number} The index of the parent node. - * @private - */ - _parent(i) { - return ((i + 1) >>> 1) - 1; - } + /** + * Compute the index for the left child of the node at index `i`. + * @param {number} i The index of the node to get the left child of. + * @returns {number} The index of the left child. + * @private + */ + _left(i) { + return (i << 1) + 1; + } - /** - * Compute the index for the left child of the node at index `i`. - * @param {number} i The index of the node to get the left child of. - * @returns {number} The index of the left child. - * @private - */ - _left(i) { - return (i << 1) + 1; - } + /** + * Compute the index for the right child of the node at index `i`. + * @param {number} i The index of the node to get the right child of. + * @returns {number} The index of the right child. + * @private + */ + _right(i) { + return (i + 1) << 1; + } - /** - * Compute the index for the right child of the node at index `i`. - * @param {number} i The index of the node to get the right child of. - * @returns {number} The index of the right child. - * @private - */ - _right(i) { - return (i + 1) << 1; - } + /** + * Check if the element at index `i` is greater than the element at index `j`. + * @param {number} i The index of the first element to compare. + * @param {number} j The index of the second element to compare. + * @returns {boolean} `true` if the element at index `i` is greater than the element at index `j`, `false` otherwise. 
+ * @private + */ + _greater(i, j) { + return this._comparator(this._heap[i], this._heap[j]); + } - /** - * Check if the element at index `i` is greater than the element at index `j`. - * @param {number} i The index of the first element to compare. - * @param {number} j The index of the second element to compare. - * @returns {boolean} `true` if the element at index `i` is greater than the element at index `j`, `false` otherwise. - * @private - */ - _greater(i, j) { - return this._comparator(this._heap[i], this._heap[j]); - } + /** + * Swap the elements at indices `i` and `j`. + * @param {number} i The index of the first element to swap. + * @param {number} j The index of the second element to swap. + * @private + */ + _swap(i, j) { + const temp = this._heap[i]; + this._heap[i] = this._heap[j]; + this._heap[j] = temp; + } - /** - * Swap the elements at indices `i` and `j`. - * @param {number} i The index of the first element to swap. - * @param {number} j The index of the second element to swap. - * @private - */ - _swap(i, j) { - const temp = this._heap[i]; - this._heap[i] = this._heap[j]; - this._heap[j] = temp; + /** + * Maintain the heap property by updating positions in the heap, + * starting at the last element and moving up the heap. + * @private + */ + _siftUp() { + let node = this.size - 1; + while (node > 0 && this._greater(node, this._parent(node))) { + this._swap(node, this._parent(node)); + node = this._parent(node); } - - /** - * Maintain the heap property by updating positions in the heap, - * starting at the last element and moving up the heap. - * @private - */ - _siftUp() { - let node = this.size - 1; - while (node > 0 && this._greater(node, this._parent(node))) { - this._swap(node, this._parent(node)); - node = this._parent(node); - } - } - /** - * Maintain the heap property by updating positions in the heap, - * starting at the first element and moving down the heap. 
- * @private - */ - _siftDown() { - let node = 0; - while ( - (this._left(node) < this.size && this._greater(this._left(node), node)) || - (this._right(node) < this.size && this._greater(this._right(node), node)) - ) { - const maxChild = (this._right(node) < this.size && this._greater(this._right(node), this._left(node))) - ? this._right(node) - : this._left(node); - this._swap(node, maxChild); - node = maxChild; - } + } + /** + * Maintain the heap property by updating positions in the heap, + * starting at the first element and moving down the heap. + * @private + */ + _siftDown() { + let node = 0; + while ( + (this._left(node) < this.size && this._greater(this._left(node), node)) || + (this._right(node) < this.size && this._greater(this._right(node), node)) + ) { + const maxChild = + this._right(node) < this.size && + this._greater(this._right(node), this._left(node)) + ? this._right(node) + : this._left(node); + this._swap(node, maxChild); + node = maxChild; } + } } /** * A trie structure to efficiently store and search for strings. */ export class CharTrie { - constructor() { - this.root = CharTrieNode.default(); - } + constructor() { + this.root = CharTrieNode.default(); + } - /** - * Adds one or more `texts` to the trie. - * @param {string[]} texts The strings to add to the trie. - */ - extend(texts) { - for (let text of texts) { - this.push(text); - } + /** + * Adds one or more `texts` to the trie. + * @param {string[]} texts The strings to add to the trie. + */ + extend(texts) { + for (let text of texts) { + this.push(text); } + } - /** - * Adds text to the trie. - * @param {string} text The string to add to the trie. - */ - push(text) { - let node = this.root; - for (let ch of text) { - let child = node.children.get(ch); - if (child === undefined) { - child = CharTrieNode.default(); - node.children.set(ch, child); - } - node = child; - } - node.isLeaf = true; + /** + * Adds text to the trie. + * @param {string} text The string to add to the trie. 
+ */ + push(text) { + let node = this.root; + for (let ch of text) { + let child = node.children.get(ch); + if (child === undefined) { + child = CharTrieNode.default(); + node.children.set(ch, child); + } + node = child; } + node.isLeaf = true; + } - /** - * Searches the trie for all strings with a common prefix of `text`. - * @param {string} text The common prefix to search for. - * @yields {string} Each string in the trie that has `text` as a prefix. - */ - *commonPrefixSearch(text) { - let node = this.root; - let prefix = ""; - for (let i = 0; i < text.length && node !== undefined; ++i) { - const ch = text[i]; - prefix += ch; - node = node.children.get(ch); - if (node !== undefined && node.isLeaf) { - yield prefix; - } - } + /** + * Searches the trie for all strings with a common prefix of `text`. + * @param {string} text The common prefix to search for. + * @yields {string} Each string in the trie that has `text` as a prefix. + */ + *commonPrefixSearch(text) { + let node = this.root; + let prefix = ""; + for (let i = 0; i < text.length && node !== undefined; ++i) { + const ch = text[i]; + prefix += ch; + node = node.children.get(ch); + if (node !== undefined && node.isLeaf) { + yield prefix; + } } + } } /** * Represents a node in a character trie. */ class CharTrieNode { - /** - * Create a new CharTrieNode. - * @param {boolean} isLeaf Whether the node is a leaf node or not. - * @param {Map} children A map containing the node's children, where the key is a character and the value is a `CharTrieNode`. - */ - constructor(isLeaf, children) { - this.isLeaf = isLeaf; - this.children = children; - } + /** + * Create a new CharTrieNode. + * @param {boolean} isLeaf Whether the node is a leaf node or not. + * @param {Map} children A map containing the node's children, where the key is a character and the value is a `CharTrieNode`. 
+ */ + constructor(isLeaf, children) { + this.isLeaf = isLeaf; + this.children = children; + } - /** - * Returns a new `CharTrieNode` instance with default values. - * @returns {CharTrieNode} A new `CharTrieNode` instance with `isLeaf` set to `false` and an empty `children` map. - */ - static default() { - return new CharTrieNode(false, new Map()); - } + /** + * Returns a new `CharTrieNode` instance with default values. + * @returns {CharTrieNode} A new `CharTrieNode` instance with `isLeaf` set to `false` and an empty `children` map. + */ + static default() { + return new CharTrieNode(false, new Map()); + } } /** * A lattice data structure to be used for tokenization. */ export class TokenLattice { - /** - * Creates a new TokenLattice instance. - * - * @param {string} sentence The input sentence to be tokenized. - * @param {number} bosTokenId The beginning-of-sequence token ID. - * @param {number} eosTokenId The end-of-sequence token ID. - */ - constructor(sentence, bosTokenId, eosTokenId) { - this.sentence = sentence; - this.len = sentence.length; - this.bosTokenId = bosTokenId; - this.eosTokenId = eosTokenId; - this.nodes = []; - this.beginNodes = Array.from({ length: this.len + 1 }, () => []); - this.endNodes = Array.from({ length: this.len + 1 }, () => []); + /** + * Creates a new TokenLattice instance. + * + * @param {string} sentence The input sentence to be tokenized. + * @param {number} bosTokenId The beginning-of-sequence token ID. + * @param {number} eosTokenId The end-of-sequence token ID. 
+ */ + constructor(sentence, bosTokenId, eosTokenId) { + this.sentence = sentence; + this.len = sentence.length; + this.bosTokenId = bosTokenId; + this.eosTokenId = eosTokenId; + this.nodes = []; + this.beginNodes = Array.from({ length: this.len + 1 }, () => []); + this.endNodes = Array.from({ length: this.len + 1 }, () => []); - const bos = new TokenLatticeNode(this.bosTokenId, 0, 0, 0, 0.0); - const eos = new TokenLatticeNode(this.eosTokenId, 1, this.len, 0, 0.0); - this.nodes.push(bos.clone()); - this.nodes.push(eos.clone()); - this.beginNodes[this.len].push(eos); - this.endNodes[0].push(bos); - } + const bos = new TokenLatticeNode(this.bosTokenId, 0, 0, 0, 0.0); + const eos = new TokenLatticeNode(this.eosTokenId, 1, this.len, 0, 0.0); + this.nodes.push(bos.clone()); + this.nodes.push(eos.clone()); + this.beginNodes[this.len].push(eos); + this.endNodes[0].push(bos); + } - /** - * Inserts a new token node into the token lattice. - * - * @param {number} pos The starting position of the token. - * @param {number} length The length of the token. - * @param {number} score The score of the token. - * @param {number} tokenId The token ID of the token. - */ - insert(pos, length, score, tokenId) { - const nodeId = this.nodes.length; - const node = new TokenLatticeNode(tokenId, nodeId, pos, length, score); - this.beginNodes[pos].push(node); - this.endNodes[pos + length].push(node); - this.nodes.push(node); - } + /** + * Inserts a new token node into the token lattice. + * + * @param {number} pos The starting position of the token. + * @param {number} length The length of the token. + * @param {number} score The score of the token. + * @param {number} tokenId The token ID of the token. 
+ */ + insert(pos, length, score, tokenId) { + const nodeId = this.nodes.length; + const node = new TokenLatticeNode(tokenId, nodeId, pos, length, score); + this.beginNodes[pos].push(node); + this.endNodes[pos + length].push(node); + this.nodes.push(node); + } - /** - * Implements the Viterbi algorithm to compute the most likely sequence of tokens. - * - * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens. - */ - viterbi() { - const len = this.len; - let pos = 0; - while (pos <= len) { - if (this.beginNodes[pos].length == 0) { - return []; - } - for (let rnode of this.beginNodes[pos]) { - rnode.prev = null; - let bestScore = 0.0; - let bestNode = null; - for (let lnode of this.endNodes[pos]) { - const score = lnode.backtraceScore + rnode.score; - if (bestNode === null || score > bestScore) { - bestNode = lnode.clone(); - bestScore = score; - } - } - - if (bestNode !== null) { - rnode.prev = bestNode; - rnode.backtraceScore = bestScore; - } else { - return []; - } - } - ++pos; + /** + * Implements the Viterbi algorithm to compute the most likely sequence of tokens. + * + * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens. 
+ */ + viterbi() { + const len = this.len; + let pos = 0; + while (pos <= len) { + if (this.beginNodes[pos].length == 0) { + return []; + } + for (let rnode of this.beginNodes[pos]) { + rnode.prev = null; + let bestScore = 0.0; + let bestNode = null; + for (let lnode of this.endNodes[pos]) { + const score = lnode.backtraceScore + rnode.score; + if (bestNode === null || score > bestScore) { + bestNode = lnode.clone(); + bestScore = score; + } } - const results = []; - const root = this.beginNodes[len][0]; - const prev = root.prev; - if (prev === null) { - return []; + if (bestNode !== null) { + rnode.prev = bestNode; + rnode.backtraceScore = bestScore; + } else { + return []; } - - let node = prev.clone(); - while (node.prev !== null) { - results.push(node.clone()); - const n = node.clone(); - node = n.prev.clone(); - } - - results.reverse(); - return results; + } + ++pos; } - /** - * @param {TokenLatticeNode} node - * @returns {string} The array of nodes representing the most likely sequence of tokens. - */ - piece(node) { - return this.sentence.slice(node.pos, node.pos + node.length); + const results = []; + const root = this.beginNodes[len][0]; + const prev = root.prev; + if (prev === null) { + return []; } - /** - * @returns {Array} The array of nodes representing the most likely sequence of tokens. - */ - tokens() { - const nodes = this.viterbi(); - return nodes.map(x => this.piece(x)); + let node = prev.clone(); + while (node.prev !== null) { + results.push(node.clone()); + const n = node.clone(); + node = n.prev.clone(); } - /** - * @returns {Array} The array of nodes representing the most likely sequence of tokens. - */ - tokenIds() { - const nodes = this.viterbi(); - return nodes.map(x => x.tokenId); - } + results.reverse(); + return results; + } + + /** + * @param {TokenLatticeNode} node + * @returns {string} The array of nodes representing the most likely sequence of tokens. 
+ */ + piece(node) { + return this.sentence.slice(node.pos, node.pos + node.length); + } + + /** + * @returns {Array} The array of nodes representing the most likely sequence of tokens. + */ + tokens() { + const nodes = this.viterbi(); + return nodes.map((x) => this.piece(x)); + } + + /** + * @returns {Array} The array of nodes representing the most likely sequence of tokens. + */ + tokenIds() { + const nodes = this.viterbi(); + return nodes.map((x) => x.tokenId); + } } class TokenLatticeNode { - /** - * Represents a node in a token lattice for a given sentence. - * @param {number} tokenId The ID of the token associated with this node. - * @param {number} nodeId The ID of this node. - * @param {number} pos The starting position of the token in the sentence. - * @param {number} length The length of the token. - * @param {number} score The score associated with the token. - */ - constructor(tokenId, nodeId, pos, length, score) { - this.tokenId = tokenId; - this.nodeId = nodeId; - this.pos = pos; - this.length = length; - this.score = score; - this.prev = null; - this.backtraceScore = 0.0; - } + /** + * Represents a node in a token lattice for a given sentence. + * @param {number} tokenId The ID of the token associated with this node. + * @param {number} nodeId The ID of this node. + * @param {number} pos The starting position of the token in the sentence. + * @param {number} length The length of the token. + * @param {number} score The score associated with the token. + */ + constructor(tokenId, nodeId, pos, length, score) { + this.tokenId = tokenId; + this.nodeId = nodeId; + this.pos = pos; + this.length = length; + this.score = score; + this.prev = null; + this.backtraceScore = 0.0; + } - /** - * Returns a clone of this node. - * @returns {TokenLatticeNode} A clone of this node. 
- */ - clone() { - const n = new TokenLatticeNode(this.tokenId, this.nodeId, this.pos, this.length, this.score); - n.prev = this.prev; - n.backtraceScore = this.backtraceScore; - return n; - } + /** + * Returns a clone of this node. + * @returns {TokenLatticeNode} A clone of this node. + */ + clone() { + const n = new TokenLatticeNode( + this.tokenId, + this.nodeId, + this.pos, + this.length, + this.score, + ); + n.prev = this.prev; + n.backtraceScore = this.backtraceScore; + return n; + } } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/generation.js b/core/vendor/modules/@xenova/transformers/src/utils/generation.js index 1f9dc898b..d3ef83531 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/generation.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/generation.js @@ -1,22 +1,13 @@ - /** * @file Classes, functions, and utilities for generation. - * + * * @todo Describe how to create a custom `GenerationConfig`. - * + * * @module utils/generation */ -import { Tensor } from './tensor.js'; -import { - Callable, - exists, -} from './core.js'; -import { - max, - softmax, - log_softmax, - getTopItems, -} from './maths.js'; +import { Tensor } from "./tensor.js"; +import { Callable, exists } from "./core.js"; +import { max, softmax, log_softmax, getTopItems } from "./maths.js"; /** * A class representing a list of logits processors. A logits processor is a function that modifies the logits @@ -26,53 +17,51 @@ import { * @extends Callable */ export class LogitsProcessorList extends Callable { - /** - * Constructs a new instance of `LogitsProcessorList`. - */ - constructor() { - super(); - this.processors = []; - } + /** + * Constructs a new instance of `LogitsProcessorList`. + */ + constructor() { + super(); + this.processors = []; + } - /** - * Adds a new logits processor to the list. - * - * @param {LogitsProcessor} item The logits processor function to add. 
- */ - push(item) { - this.processors.push(item); - } + /** + * Adds a new logits processor to the list. + * + * @param {LogitsProcessor} item The logits processor function to add. + */ + push(item) { + this.processors.push(item); + } - /** - * Adds multiple logits processors to the list. - * - * @param {LogitsProcessor[]} items The logits processor functions to add. - */ - extend(items) { - this.processors.push(...items); - } + /** + * Adds multiple logits processors to the list. + * + * @param {LogitsProcessor[]} items The logits processor functions to add. + */ + extend(items) { + this.processors.push(...items); + } - /** - * Applies all logits processors in the list to a batch of logits, modifying them in-place. - * - * @param {number[]} input_ids The input IDs for the language model. - * @param {number[][]} batchedLogits A 2D array of logits, where each row corresponds to a single - * input sequence in the batch. - */ - _call(input_ids, batchedLogits) { - // NOTE: This is different from the Python code, since vanilla JS does not support vectorized operations. - // As a result, we apply each processor to each item in the batch. - for (let logits of batchedLogits) { - // Modifies logits inplace - this.processors.forEach( - func => func(input_ids, logits) - ) - } + /** + * Applies all logits processors in the list to a batch of logits, modifying them in-place. + * + * @param {number[]} input_ids The input IDs for the language model. + * @param {number[][]} batchedLogits A 2D array of logits, where each row corresponds to a single + * input sequence in the batch. + */ + _call(input_ids, batchedLogits) { + // NOTE: This is different from the Python code, since vanilla JS does not support vectorized operations. + // As a result, we apply each processor to each item in the batch. 
+ for (let logits of batchedLogits) { + // Modifies logits inplace + this.processors.forEach((func) => func(input_ids, logits)); } + } - [Symbol.iterator]() { - return this.processors.values(); - } + [Symbol.iterator]() { + return this.processors.values(); + } } /** @@ -80,50 +69,51 @@ export class LogitsProcessorList extends Callable { * @extends Callable */ export class LogitsProcessor extends Callable { - /** - * Apply the processor to the input logits. - * - * @abstract - * @param {Array} input_ids The input ids. - * @param {Tensor} logits The logits to process. - * @throws {Error} Throws an error if `_call` is not implemented in the subclass. - */ - _call(input_ids, logits) { - throw Error("`_call` should be implemented in a subclass") - } + /** + * Apply the processor to the input logits. + * + * @abstract + * @param {Array} input_ids The input ids. + * @param {Tensor} logits The logits to process. + * @throws {Error} Throws an error if `_call` is not implemented in the subclass. + */ + _call(input_ids, logits) { + throw Error("`_call` should be implemented in a subclass"); + } } /** * A logits processor that forces a specific token to be generated by the decoder. - * + * * @extends LogitsProcessor */ export class ForceTokensLogitsProcessor extends LogitsProcessor { - /** - * Constructs a new instance of `ForceTokensLogitsProcessor`. - * - * @param {Array} forced_decoder_ids The ids of tokens that should be forced. - */ - constructor(forced_decoder_ids) { - super(); - this.force_token_map = Object.fromEntries(forced_decoder_ids ?? []); - } + /** + * Constructs a new instance of `ForceTokensLogitsProcessor`. + * + * @param {Array} forced_decoder_ids The ids of tokens that should be forced. + */ + constructor(forced_decoder_ids) { + super(); + this.force_token_map = Object.fromEntries(forced_decoder_ids ?? []); + } - /** - * Apply the processor to the input logits. - * - * @param {Array} input_ids The input ids. - * @param {Tensor} logits The logits to process. 
- * @returns {Tensor} The processed logits. - */ - _call(input_ids, logits) { - let map = this.force_token_map[input_ids.length]; - if (exists(map)) { // There exists a mapping - logits.data.fill(-Infinity) - logits.data[map] = 0; - } - return logits; + /** + * Apply the processor to the input logits. + * + * @param {Array} input_ids The input ids. + * @param {Tensor} logits The logits to process. + * @returns {Tensor} The processed logits. + */ + _call(input_ids, logits) { + let map = this.force_token_map[input_ids.length]; + if (exists(map)) { + // There exists a mapping + logits.data.fill(-Infinity); + logits.data[map] = 0; } + return logits; + } } /** @@ -131,57 +121,57 @@ export class ForceTokensLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor { - /** - * Create a ForcedBOSTokenLogitsProcessor. - * @param {number} bos_token_id The ID of the beginning-of-sequence token to be forced. - */ - constructor(bos_token_id) { - super(); - this.bos_token_id = bos_token_id; - } + /** + * Create a ForcedBOSTokenLogitsProcessor. + * @param {number} bos_token_id The ID of the beginning-of-sequence token to be forced. + */ + constructor(bos_token_id) { + super(); + this.bos_token_id = bos_token_id; + } - /** - * Apply the BOS token forcing to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with BOS token forcing. - */ - _call(input_ids, logits) { - if (input_ids.length === 1) { - logits.data.fill(-Infinity) - logits.data[this.bos_token_id] = 0; - } - return logits; + /** + * Apply the BOS token forcing to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with BOS token forcing. 
+ */ + _call(input_ids, logits) { + if (input_ids.length === 1) { + logits.data.fill(-Infinity); + logits.data[this.bos_token_id] = 0; } + return logits; + } } /** * A logits processor that forces end-of-sequence token probability to 1. - * + * * @extends LogitsProcessor */ export class ForcedEOSTokenLogitsProcessor extends LogitsProcessor { - /** - * Create a ForcedEOSTokenLogitsProcessor. - * @param {number} max_length Max length of the sequence. - * @param {number|number[]} forced_eos_token_id The ID of the end-of-sequence token to be forced. - */ - constructor(max_length, forced_eos_token_id) { - super(); - this.max_length = max_length; - this.forced_eos_token_id = forced_eos_token_id; - } + /** + * Create a ForcedEOSTokenLogitsProcessor. + * @param {number} max_length Max length of the sequence. + * @param {number|number[]} forced_eos_token_id The ID of the end-of-sequence token to be forced. + */ + constructor(max_length, forced_eos_token_id) { + super(); + this.max_length = max_length; + this.forced_eos_token_id = forced_eos_token_id; + } - /** - * Apply the processor to input_ids and logits. - * - * @param {number[]} input_ids The input ids. - * @param {Tensor} logits The logits tensor. - */ - _call(input_ids, logits) { - // console.log('call ForcedEOSTokenLogitsProcessor') - // TODO - } + /** + * Apply the processor to input_ids and logits. + * + * @param {number[]} input_ids The input ids. + * @param {Tensor} logits The logits tensor. + */ + _call(input_ids, logits) { + // console.log('call ForcedEOSTokenLogitsProcessor') + // TODO + } } /** @@ -191,31 +181,31 @@ export class ForcedEOSTokenLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor { - /** - * Create a SuppressTokensAtBeginLogitsProcessor. - * @param {number[]} begin_suppress_tokens The IDs of the tokens to suppress. 
- * @param {number} begin_index The number of tokens to generate before suppressing tokens. - */ - constructor(begin_suppress_tokens, begin_index) { - super(); - this.begin_suppress_tokens = begin_suppress_tokens; - this.begin_index = begin_index; - } + /** + * Create a SuppressTokensAtBeginLogitsProcessor. + * @param {number[]} begin_suppress_tokens The IDs of the tokens to suppress. + * @param {number} begin_index The number of tokens to generate before suppressing tokens. + */ + constructor(begin_suppress_tokens, begin_index) { + super(); + this.begin_suppress_tokens = begin_suppress_tokens; + this.begin_index = begin_index; + } - /** - * Apply the BOS token forcing to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with BOS token forcing. - */ - _call(input_ids, logits) { - if (input_ids.length === this.begin_index) { - for (let token_id of this.begin_suppress_tokens) { - logits.data[token_id] = -Infinity; - } - } - return logits; + /** + * Apply the BOS token forcing to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with BOS token forcing. + */ + _call(input_ids, logits) { + if (input_ids.length === this.begin_index) { + for (let token_id of this.begin_suppress_tokens) { + logits.data[token_id] = -Infinity; + } } + return logits; + } } /** @@ -223,317 +213,348 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { - /** - * Constructs a new WhisperTimeStampLogitsProcessor. - * @param {Object} generate_config The config object passed to the `generate()` method of a transformer model. - * @param {number} generate_config.eos_token_id The ID of the end-of-sequence token. 
- * @param {number} generate_config.no_timestamps_token_id The ID of the token used to indicate that a token should not have a timestamp. - * @param {number[][]} [generate_config.forced_decoder_ids] An array of two-element arrays representing decoder IDs that are forced to appear in the output. The second element of each array indicates whether the token is a timestamp. - * @param {number} [generate_config.max_initial_timestamp_index] The maximum index at which an initial timestamp can appear. - */ - constructor(generate_config) { - super(); - this.eos_token_id = generate_config.eos_token_id; - this.no_timestamps_token_id = generate_config.no_timestamps_token_id; - this.timestamp_begin = this.no_timestamps_token_id + 1; + /** + * Constructs a new WhisperTimeStampLogitsProcessor. + * @param {Object} generate_config The config object passed to the `generate()` method of a transformer model. + * @param {number} generate_config.eos_token_id The ID of the end-of-sequence token. + * @param {number} generate_config.no_timestamps_token_id The ID of the token used to indicate that a token should not have a timestamp. + * @param {number[][]} [generate_config.forced_decoder_ids] An array of two-element arrays representing decoder IDs that are forced to appear in the output. The second element of each array indicates whether the token is a timestamp. + * @param {number} [generate_config.max_initial_timestamp_index] The maximum index at which an initial timestamp can appear. 
+ */ + constructor(generate_config) { + super(); + this.eos_token_id = generate_config.eos_token_id; + this.no_timestamps_token_id = generate_config.no_timestamps_token_id; + this.timestamp_begin = this.no_timestamps_token_id + 1; - this.begin_index = (generate_config.forced_decoder_ids || []).length + 2; - if (generate_config.forced_decoder_ids.slice(-1)[0][1] === this.no_timestamps_token_id) { - this.begin_index -= 1; - } - this.max_initial_timestamp_index = generate_config.max_initial_timestamp_index; + this.begin_index = (generate_config.forced_decoder_ids || []).length + 2; + if ( + generate_config.forced_decoder_ids.slice(-1)[0][1] === + this.no_timestamps_token_id + ) { + this.begin_index -= 1; + } + this.max_initial_timestamp_index = + generate_config.max_initial_timestamp_index; + } + /** + * Modify the logits to handle timestamp tokens. + * @param {Array} input_ids The input sequence of tokens. + * @param {Tensor} logits The logits output by the model. + * @returns {Tensor} The modified logits. + */ + _call(input_ids, logits) { + const logitsData = /** @type {Float32Array} */ (logits.data); + + // suppress <|notimestamps|> which is handled by without_timestamps + logitsData[this.no_timestamps_token_id] = -Infinity; + + if (input_ids.length === this.begin_index - 1) { + logitsData.fill(-Infinity); + logitsData[this.timestamp_begin] = 0; + return logits; } - /** - * Modify the logits to handle timestamp tokens. - * @param {Array} input_ids The input sequence of tokens. - * @param {Tensor} logits The logits output by the model. - * @returns {Tensor} The modified logits. 
- */ - _call(input_ids, logits) { - const logitsData = /** @type {Float32Array} */(logits.data); + // timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly + const seq = input_ids.slice(this.begin_index); + const last_was_timestamp = + seq.length >= 1 && seq[seq.length - 1] >= this.timestamp_begin; + const penultimate_was_timestamp = + seq.length < 2 || seq[seq.length - 2] >= this.timestamp_begin; - // suppress <|notimestamps|> which is handled by without_timestamps - logitsData[this.no_timestamps_token_id] = -Infinity; - - if (input_ids.length === this.begin_index - 1) { - logitsData.fill(-Infinity); - logitsData[this.timestamp_begin] = 0; - return logits; - } - - // timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly - const seq = input_ids.slice(this.begin_index); - const last_was_timestamp = seq.length >= 1 && seq[seq.length - 1] >= this.timestamp_begin; - const penultimate_was_timestamp = seq.length < 2 || seq[seq.length - 2] >= this.timestamp_begin; - - if (last_was_timestamp) { - if (penultimate_was_timestamp) { // has to be non-timestamp - logitsData.subarray(this.timestamp_begin).fill(-Infinity); - } else { // cannot be normal text tokens - logitsData.subarray(0, this.eos_token_id).fill(-Infinity); - } - } - - // apply the `max_initial_timestamp` option - if (input_ids.length === this.begin_index && this.max_initial_timestamp_index !== null) { - const last_allowed = this.timestamp_begin + this.max_initial_timestamp_index; - logitsData.subarray(last_allowed + 1).fill(-Infinity); - } - - // if sum of probability over timestamps is above any other token, sample timestamp - const logprobs = log_softmax(logitsData); - const timestamp_logprob = Math.log(logprobs.subarray(this.timestamp_begin).map(Math.exp).reduce((a, b) => a + b)); - const max_text_token_logprob = max(logprobs.subarray(0, this.timestamp_begin))[0]; - - if (timestamp_logprob > max_text_token_logprob) { - 
logitsData.subarray(0, this.timestamp_begin).fill(-Infinity); - } - - return logits; + if (last_was_timestamp) { + if (penultimate_was_timestamp) { + // has to be non-timestamp + logitsData.subarray(this.timestamp_begin).fill(-Infinity); + } else { + // cannot be normal text tokens + logitsData.subarray(0, this.eos_token_id).fill(-Infinity); + } } + + // apply the `max_initial_timestamp` option + if ( + input_ids.length === this.begin_index && + this.max_initial_timestamp_index !== null + ) { + const last_allowed = + this.timestamp_begin + this.max_initial_timestamp_index; + logitsData.subarray(last_allowed + 1).fill(-Infinity); + } + + // if sum of probability over timestamps is above any other token, sample timestamp + const logprobs = log_softmax(logitsData); + const timestamp_logprob = Math.log( + logprobs + .subarray(this.timestamp_begin) + .map(Math.exp) + .reduce((a, b) => a + b), + ); + const max_text_token_logprob = max( + logprobs.subarray(0, this.timestamp_begin), + )[0]; + + if (timestamp_logprob > max_text_token_logprob) { + logitsData.subarray(0, this.timestamp_begin).fill(-Infinity); + } + + return logits; + } } /** * A logits processor that disallows ngrams of a certain size to be repeated. - * + * * @extends LogitsProcessor */ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor { - /** - * Create a NoRepeatNGramLogitsProcessor. - * @param {number} no_repeat_ngram_size The no-repeat-ngram size. All ngrams of this size can only occur once. - */ - constructor(no_repeat_ngram_size) { - super(); - this.no_repeat_ngram_size = no_repeat_ngram_size; + /** + * Create a NoRepeatNGramLogitsProcessor. + * @param {number} no_repeat_ngram_size The no-repeat-ngram size. All ngrams of this size can only occur once. + */ + constructor(no_repeat_ngram_size) { + super(); + this.no_repeat_ngram_size = no_repeat_ngram_size; + } + + /** + * Generate n-grams from a sequence of token ids. 
+ * @param {number[]} prevInputIds List of previous input ids + * @returns {Map} Map of generated n-grams + */ + getNgrams(prevInputIds) { + const curLen = prevInputIds.length; + + /**@type {number[][]} */ + const ngrams = []; + for (let j = 0; j < curLen + 1 - this.no_repeat_ngram_size; ++j) { + const ngram = []; + for (let k = 0; k < this.no_repeat_ngram_size; ++k) { + ngram.push(prevInputIds[j + k]); + } + ngrams.push(ngram); } - /** - * Generate n-grams from a sequence of token ids. - * @param {number[]} prevInputIds List of previous input ids - * @returns {Map} Map of generated n-grams - */ - getNgrams(prevInputIds) { - const curLen = prevInputIds.length; - - /**@type {number[][]} */ - const ngrams = []; - for (let j = 0; j < curLen + 1 - this.no_repeat_ngram_size; ++j) { - const ngram = []; - for (let k = 0; k < this.no_repeat_ngram_size; ++k) { - ngram.push(prevInputIds[j + k]); - } - ngrams.push(ngram); - } - - /** @type {Map} */ - const generatedNgram = new Map(); - for (const ngram of ngrams) { - const prevNgram = ngram.slice(0, ngram.length - 1); - const prevNgramKey = JSON.stringify(prevNgram); - const prevNgramValue = generatedNgram.get(prevNgramKey) ?? []; - prevNgramValue.push(ngram[ngram.length - 1]); - generatedNgram.set(prevNgramKey, prevNgramValue); - } - return generatedNgram; + /** @type {Map} */ + const generatedNgram = new Map(); + for (const ngram of ngrams) { + const prevNgram = ngram.slice(0, ngram.length - 1); + const prevNgramKey = JSON.stringify(prevNgram); + const prevNgramValue = generatedNgram.get(prevNgramKey) ?? []; + prevNgramValue.push(ngram[ngram.length - 1]); + generatedNgram.set(prevNgramKey, prevNgramValue); } + return generatedNgram; + } - /** - * Generate n-grams from a sequence of token ids. 
- * @param {Map} bannedNgrams Map of banned n-grams - * @param {number[]} prevInputIds List of previous input ids - * @returns {number[]} Map of generated n-grams - */ - getGeneratedNgrams(bannedNgrams, prevInputIds) { - const ngramIdx = prevInputIds.slice(prevInputIds.length + 1 - this.no_repeat_ngram_size, prevInputIds.length); - const banned = bannedNgrams.get(JSON.stringify(ngramIdx)) ?? []; - return banned; + /** + * Generate n-grams from a sequence of token ids. + * @param {Map} bannedNgrams Map of banned n-grams + * @param {number[]} prevInputIds List of previous input ids + * @returns {number[]} Map of generated n-grams + */ + getGeneratedNgrams(bannedNgrams, prevInputIds) { + const ngramIdx = prevInputIds.slice( + prevInputIds.length + 1 - this.no_repeat_ngram_size, + prevInputIds.length, + ); + const banned = bannedNgrams.get(JSON.stringify(ngramIdx)) ?? []; + return banned; + } + + /** + * Calculate banned n-gram tokens + * @param {number[]} prevInputIds List of previous input ids + * @returns {number[]} Map of generated n-grams + */ + calcBannedNgramTokens(prevInputIds) { + const bannedTokens = []; + if (prevInputIds.length + 1 < this.no_repeat_ngram_size) { + // return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return bannedTokens; + } else { + const generatedNgrams = this.getNgrams(prevInputIds); + const bannedTokens = this.getGeneratedNgrams( + generatedNgrams, + prevInputIds, + ); + return bannedTokens; } + } - /** - * Calculate banned n-gram tokens - * @param {number[]} prevInputIds List of previous input ids - * @returns {number[]} Map of generated n-grams - */ - calcBannedNgramTokens(prevInputIds) { - const bannedTokens = []; - if (prevInputIds.length + 1 < this.no_repeat_ngram_size) { - // return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return bannedTokens; + /** + * Apply the no-repeat-ngram processor to the logits. + * @param {Array} input_ids The input IDs. 
+ * @param {Object} logits The logits. + * @returns {Object} The logits with no-repeat-ngram processing. + */ + _call(input_ids, logits) { + const bannedTokens = this.calcBannedNgramTokens(input_ids); - } else { - const generatedNgrams = this.getNgrams(prevInputIds); - const bannedTokens = this.getGeneratedNgrams(generatedNgrams, prevInputIds); - return bannedTokens; - } - } - - /** - * Apply the no-repeat-ngram processor to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with no-repeat-ngram processing. - */ - _call(input_ids, logits) { - const bannedTokens = this.calcBannedNgramTokens(input_ids); - - for (const token of bannedTokens) { - logits.data[token] = -Infinity; - } - return logits; + for (const token of bannedTokens) { + logits.data[token] = -Infinity; } + return logits; + } } /** * A logits processor that penalises repeated output tokens. - * + * * @extends LogitsProcessor */ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor { - /** - * Create a RepetitionPenaltyLogitsProcessor. - * @param {number} penalty The penalty to apply for repeated tokens. - */ - constructor(penalty) { - super(); - this.penalty = penalty; - } + /** + * Create a RepetitionPenaltyLogitsProcessor. + * @param {number} penalty The penalty to apply for repeated tokens. + */ + constructor(penalty) { + super(); + this.penalty = penalty; + } - /** - * Apply the repetition penalty to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with repetition penalty processing. - */ - _call(input_ids, logits) { - // Modify the logits corresponding to each element in `input_ids`. - // As a consequence, the logits corresponding to tokens that appear - // many times in the output will be penalised more. 
- for (const input_id of input_ids) { - if (logits.data[input_id] < 0) { - logits.data[input_id] *= this.penalty; - } else { - logits.data[input_id] /= this.penalty; - } - } - return logits + /** + * Apply the repetition penalty to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with repetition penalty processing. + */ + _call(input_ids, logits) { + // Modify the logits corresponding to each element in `input_ids`. + // As a consequence, the logits corresponding to tokens that appear + // many times in the output will be penalised more. + for (const input_id of input_ids) { + if (logits.data[input_id] < 0) { + logits.data[input_id] *= this.penalty; + } else { + logits.data[input_id] /= this.penalty; + } } + return logits; + } } /** * A logits processor that enforces a minimum number of tokens. - * + * * @extends LogitsProcessor */ export class MinLengthLogitsProcessor extends LogitsProcessor { - /** - * Create a MinLengthLogitsProcessor. - * @param {number} min_length The minimum length below which the score of `eos_token_id` is set to negative infinity. - * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. - */ - constructor(min_length, eos_token_id) { - super(); - this.min_length = min_length; - this.eos_token_id = Array.isArray(eos_token_id) ? eos_token_id : [eos_token_id]; + /** + * Create a MinLengthLogitsProcessor. + * @param {number} min_length The minimum length below which the score of `eos_token_id` is set to negative infinity. + * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. + */ + constructor(min_length, eos_token_id) { + super(); + this.min_length = min_length; + this.eos_token_id = Array.isArray(eos_token_id) + ? eos_token_id + : [eos_token_id]; + } + + /** + * Apply logit processor. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. 
+ */ + _call(input_ids, logits) { + if (input_ids.length < this.min_length) { + for (const eos_token of this.eos_token_id) { + logits.data[eos_token] = -Infinity; + } } - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The processed logits. - */ - _call(input_ids, logits) { - if (input_ids.length < this.min_length) { - for (const eos_token of this.eos_token_id) { - logits.data[eos_token] = -Infinity; - } - } - - return logits - } + return logits; + } } /** * A logits processor that enforces a minimum number of new tokens. - * + * * @extends LogitsProcessor */ export class MinNewTokensLengthLogitsProcessor extends LogitsProcessor { - /** - * Create a MinNewTokensLengthLogitsProcessor. - * @param {number} prompt_length_to_skip The input tokens length. - * @param {number} min_new_tokens The minimum *new* tokens length below which the score of `eos_token_id` is set to negative infinity. - * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. - */ - constructor(prompt_length_to_skip, min_new_tokens, eos_token_id) { - super(); - this.prompt_length_to_skip = prompt_length_to_skip; - this.min_new_tokens = min_new_tokens; - this.eos_token_id = Array.isArray(eos_token_id) ? eos_token_id : [eos_token_id]; + /** + * Create a MinNewTokensLengthLogitsProcessor. + * @param {number} prompt_length_to_skip The input tokens length. + * @param {number} min_new_tokens The minimum *new* tokens length below which the score of `eos_token_id` is set to negative infinity. + * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. + */ + constructor(prompt_length_to_skip, min_new_tokens, eos_token_id) { + super(); + this.prompt_length_to_skip = prompt_length_to_skip; + this.min_new_tokens = min_new_tokens; + this.eos_token_id = Array.isArray(eos_token_id) + ? eos_token_id + : [eos_token_id]; + } + + /** + * Apply logit processor. 
+ * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. + */ + _call(input_ids, logits) { + const new_tokens_length = input_ids.length - this.prompt_length_to_skip; + if (new_tokens_length < this.min_new_tokens) { + for (const eos_token of this.eos_token_id) { + logits.data[eos_token] = -Infinity; + } } - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The processed logits. - */ - _call(input_ids, logits) { - const new_tokens_length = input_ids.length - this.prompt_length_to_skip; - if (new_tokens_length < this.min_new_tokens) { - for (const eos_token of this.eos_token_id) { - logits.data[eos_token] = -Infinity; - } - } - - return logits - } + return logits; + } } export class NoBadWordsLogitsProcessor extends LogitsProcessor { - /** - * Create a `NoBadWordsLogitsProcessor`. - * @param {number[][]} bad_words_ids List of list of token ids that are not allowed to be generated. - * @param {number|number[]} eos_token_id The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - */ - constructor(bad_words_ids, eos_token_id) { - super(); - this.bad_words_ids = bad_words_ids; - this.eos_token_id = Array.isArray(eos_token_id) ? eos_token_id : [eos_token_id]; - } + /** + * Create a `NoBadWordsLogitsProcessor`. + * @param {number[][]} bad_words_ids List of list of token ids that are not allowed to be generated. + * @param {number|number[]} eos_token_id The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + */ + constructor(bad_words_ids, eos_token_id) { + super(); + this.bad_words_ids = bad_words_ids; + this.eos_token_id = Array.isArray(eos_token_id) + ? eos_token_id + : [eos_token_id]; + } - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. 
- * @returns {Object} The processed logits. - */ - _call(input_ids, logits) { + /** + * Apply logit processor. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. + */ + _call(input_ids, logits) { + for (const bad_word_ids of this.bad_words_ids) { + // Whether to modify the logits of the last token in the bad word id sequence + let mark = true; - for (const bad_word_ids of this.bad_words_ids) { - // Whether to modify the logits of the last token in the bad word id sequence - let mark = true; - - // For each bad word in the list, if the current sequence of input ids ends with this sequence (excluding the last), - // then we set the logits of the last bad word id to -Infinity. - for (let i = 1; i <= bad_word_ids.length - 1 && bad_word_ids.length < input_ids.length; ++i) { - - if (bad_word_ids.at(-i - 1) !== input_ids.at(-i)) { - // We have found a mismatch - mark = false; - break; - } - } - if (mark) { - logits.data[bad_word_ids.at(-1)] = -Infinity; - } + // For each bad word in the list, if the current sequence of input ids ends with this sequence (excluding the last), + // then we set the logits of the last bad word id to -Infinity. + for ( + let i = 1; + i <= bad_word_ids.length - 1 && bad_word_ids.length < input_ids.length; + ++i + ) { + if (bad_word_ids.at(-i - 1) !== input_ids.at(-i)) { + // We have found a mismatch + mark = false; + break; } - - return logits + } + if (mark) { + logits.data[bad_word_ids.at(-1)] = -Infinity; + } } + + return logits; + } } /** @@ -569,7 +590,7 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor { * @property {number[][]|number[][][]} [force_words_ids=null] List of token ids that must be generated. If given a `number[][]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. 
If given `number[][][]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word. * @property {boolean} [renormalize_logits=false] Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `true` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. * @property {Object[]} [constraints=null] Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by `Constraint` objects, in the most sensible way possible. - * + * * @property {number} [forced_bos_token_id=null] The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for multilingual models like mBART where the first generated token needs to be the target language token. * @property {number|number[]} [forced_eos_token_id=null] The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens. * @property {boolean} [remove_invalid_values=false] Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation. @@ -577,20 +598,20 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor { * @property {number[]} [suppress_tokens=null] A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. * @property {number[]} [begin_suppress_tokens=null] A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. 
* @property {number[][]} [forced_decoder_ids=null] A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123. - * + * * @property {number} [num_return_sequences=1] The number of independently computed returned sequences for each element in the batch. * @property {boolean} [output_attentions=false] Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details. * @property {boolean} [output_hidden_states=false] Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details. * @property {boolean} [output_scores=false] Whether or not to return the prediction scores. See `scores` under returned tensors for more details. * @property {boolean} [return_dict_in_generate=false] Whether or not to return a `ModelOutput` instead of a plain tuple. - * + * * @property {number} [pad_token_id=null] The id of the *padding* token. * @property {number} [bos_token_id=null] The id of the *beginning-of-sequence* token. * @property {number|number[]} [eos_token_id=null] The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - * + * * @property {number} [encoder_no_repeat_ngram_size=0] If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. * @property {number} [decoder_start_token_id=null] If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. - * + * * @property {Object} [generation_kwargs={}] Additional generation kwargs will be forwarded to the `generate` function of the model. Kwargs that are not present in `generate`'s signature will be used in the model forward pass. 
*/ @@ -598,180 +619,184 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor { * Class that holds a configuration for a generation task. * @type {new (kwargs?: GenerationConfigType) => GenerationConfigType} */ -export const GenerationConfig = /** @type {any} */ (class { - +export const GenerationConfig = /** @type {any} */ ( + class { /** * Create a new GenerationConfig object. - * @param {GenerationConfigType} kwargs + * @param {GenerationConfigType} kwargs */ constructor(kwargs = {}) { - // Parameters that control the length of the output - this.max_length = kwargs.max_length ?? 20; - this.max_new_tokens = kwargs.max_new_tokens ?? null; - this.min_length = kwargs.min_length ?? 0; - this.min_new_tokens = kwargs.min_new_tokens ?? null; - this.early_stopping = kwargs.early_stopping ?? false; - this.max_time = kwargs.max_time ?? null; + // Parameters that control the length of the output + this.max_length = kwargs.max_length ?? 20; + this.max_new_tokens = kwargs.max_new_tokens ?? null; + this.min_length = kwargs.min_length ?? 0; + this.min_new_tokens = kwargs.min_new_tokens ?? null; + this.early_stopping = kwargs.early_stopping ?? false; + this.max_time = kwargs.max_time ?? null; - // Parameters that control the generation strategy used - this.do_sample = kwargs.do_sample ?? false; - this.num_beams = kwargs.num_beams ?? 1; - this.num_beam_groups = kwargs.num_beam_groups ?? 1; - this.penalty_alpha = kwargs.penalty_alpha ?? null; - this.use_cache = kwargs.use_cache ?? true; + // Parameters that control the generation strategy used + this.do_sample = kwargs.do_sample ?? false; + this.num_beams = kwargs.num_beams ?? 1; + this.num_beam_groups = kwargs.num_beam_groups ?? 1; + this.penalty_alpha = kwargs.penalty_alpha ?? null; + this.use_cache = kwargs.use_cache ?? true; - // Parameters for manipulation of the model output logits - this.temperature = kwargs.temperature ?? 1.0; - this.top_k = kwargs.top_k ?? 50; - this.top_p = kwargs.top_p ?? 
1.0; - this.typical_p = kwargs.typical_p ?? 1.0; - this.epsilon_cutoff = kwargs.epsilon_cutoff ?? 0.0; - this.eta_cutoff = kwargs.eta_cutoff ?? 0.0; - this.diversity_penalty = kwargs.diversity_penalty ?? 0.0; - this.repetition_penalty = kwargs.repetition_penalty ?? 1.0; - this.encoder_repetition_penalty = kwargs.encoder_repetition_penalty ?? 1.0; - this.length_penalty = kwargs.length_penalty ?? 1.0; - this.no_repeat_ngram_size = kwargs.no_repeat_ngram_size ?? 0; - this.bad_words_ids = kwargs.bad_words_ids ?? null; - this.force_words_ids = kwargs.force_words_ids ?? null; - this.renormalize_logits = kwargs.renormalize_logits ?? false; - this.constraints = kwargs.constraints ?? null; - this.forced_bos_token_id = kwargs.forced_bos_token_id ?? null; - this.forced_eos_token_id = kwargs.forced_eos_token_id ?? null; - this.remove_invalid_values = kwargs.remove_invalid_values ?? false; - this.exponential_decay_length_penalty = kwargs.exponential_decay_length_penalty ?? null; - this.suppress_tokens = kwargs.suppress_tokens ?? null; - this.begin_suppress_tokens = kwargs.begin_suppress_tokens ?? null; - this.forced_decoder_ids = kwargs.forced_decoder_ids ?? null; + // Parameters for manipulation of the model output logits + this.temperature = kwargs.temperature ?? 1.0; + this.top_k = kwargs.top_k ?? 50; + this.top_p = kwargs.top_p ?? 1.0; + this.typical_p = kwargs.typical_p ?? 1.0; + this.epsilon_cutoff = kwargs.epsilon_cutoff ?? 0.0; + this.eta_cutoff = kwargs.eta_cutoff ?? 0.0; + this.diversity_penalty = kwargs.diversity_penalty ?? 0.0; + this.repetition_penalty = kwargs.repetition_penalty ?? 1.0; + this.encoder_repetition_penalty = + kwargs.encoder_repetition_penalty ?? 1.0; + this.length_penalty = kwargs.length_penalty ?? 1.0; + this.no_repeat_ngram_size = kwargs.no_repeat_ngram_size ?? 0; + this.bad_words_ids = kwargs.bad_words_ids ?? null; + this.force_words_ids = kwargs.force_words_ids ?? null; + this.renormalize_logits = kwargs.renormalize_logits ?? 
false; + this.constraints = kwargs.constraints ?? null; + this.forced_bos_token_id = kwargs.forced_bos_token_id ?? null; + this.forced_eos_token_id = kwargs.forced_eos_token_id ?? null; + this.remove_invalid_values = kwargs.remove_invalid_values ?? false; + this.exponential_decay_length_penalty = + kwargs.exponential_decay_length_penalty ?? null; + this.suppress_tokens = kwargs.suppress_tokens ?? null; + this.begin_suppress_tokens = kwargs.begin_suppress_tokens ?? null; + this.forced_decoder_ids = kwargs.forced_decoder_ids ?? null; - // Parameters that define the output variables of `generate` - this.num_return_sequences = kwargs.num_return_sequences ?? 1; - this.output_attentions = kwargs.output_attentions ?? false; - this.output_hidden_states = kwargs.output_hidden_states ?? false; - this.output_scores = kwargs.output_scores ?? false; - this.return_dict_in_generate = kwargs.return_dict_in_generate ?? false; + // Parameters that define the output variables of `generate` + this.num_return_sequences = kwargs.num_return_sequences ?? 1; + this.output_attentions = kwargs.output_attentions ?? false; + this.output_hidden_states = kwargs.output_hidden_states ?? false; + this.output_scores = kwargs.output_scores ?? false; + this.return_dict_in_generate = kwargs.return_dict_in_generate ?? false; - // Special tokens that can be used at generation time - this.pad_token_id = kwargs.pad_token_id ?? null; - this.bos_token_id = kwargs.bos_token_id ?? null; - this.eos_token_id = kwargs.eos_token_id ?? null; + // Special tokens that can be used at generation time + this.pad_token_id = kwargs.pad_token_id ?? null; + this.bos_token_id = kwargs.bos_token_id ?? null; + this.eos_token_id = kwargs.eos_token_id ?? null; - // Generation parameters exclusive to encoder-decoder models - this.encoder_no_repeat_ngram_size = kwargs.encoder_no_repeat_ngram_size ?? 0; - this.decoder_start_token_id = kwargs.decoder_start_token_id ?? 
null; + // Generation parameters exclusive to encoder-decoder models + this.encoder_no_repeat_ngram_size = + kwargs.encoder_no_repeat_ngram_size ?? 0; + this.decoder_start_token_id = kwargs.decoder_start_token_id ?? null; - // Wild card - this.generation_kwargs = kwargs.generation_kwargs ?? {}; + // Wild card + this.generation_kwargs = kwargs.generation_kwargs ?? {}; } -}); + } +); /** * Sampler is a base class for all sampling methods used for text generation. */ export class Sampler extends Callable { - /** - * Creates a new Sampler object with the specified generation config. - * @param {GenerationConfigType} generation_config The generation config. - */ - constructor(generation_config) { - super(); - this.generation_config = generation_config; + /** + * Creates a new Sampler object with the specified generation config. + * @param {GenerationConfigType} generation_config The generation config. + */ + constructor(generation_config) { + super(); + this.generation_config = generation_config; + } + + /** + * Executes the sampler, using the specified logits. + * @param {Tensor} logits + * @param {number} index + * @returns {void} + */ + _call(logits, index = -1) { + // Sample from logits, of dims [batch, sequence_length, vocab_size]. + // If index is specified, sample from [batch, index, vocab_size]. + return this.sample(logits, index); + } + + /** + * Abstract method for sampling the logits. + * @param {Tensor} logits + * @param {number} index + * @throws {Error} + */ + sample(logits, index) { + throw Error("sample should be implemented in subclasses."); + } + + /** + * Returns the specified logits as an array, with temperature applied. 
+ * @param {Tensor} logits + * @param {number} index + * @returns {Float32Array} + */ + getLogits(logits, index) { + let vocabSize = logits.dims.at(-1); + + let logs = /** @type {Float32Array} */ (logits.data); + + if (index === -1) { + logs = logs.slice(-vocabSize); + } else { + let startIndex = index * vocabSize; + logs = logs.slice(startIndex, startIndex + vocabSize); } - /** - * Executes the sampler, using the specified logits. - * @param {Tensor} logits - * @param {number} index - * @returns {void} - */ - _call(logits, index = -1) { - // Sample from logits, of dims [batch, sequence_length, vocab_size]. - // If index is specified, sample from [batch, index, vocab_size]. - return this.sample(logits, index); + // add temperature + if (this.generation_config.temperature > 0) { + logs = logs.map((x) => x / this.generation_config.temperature); } + return logs; + } - /** - * Abstract method for sampling the logits. - * @param {Tensor} logits - * @param {number} index - * @throws {Error} - */ - sample(logits, index) { - throw Error("sample should be implemented in subclasses.") + /** + * Selects an item randomly based on the specified probabilities. + * @param {Array} probabilities An array of probabilities to use for selection. + * @returns {number} The index of the selected item. + */ + randomSelect(probabilities) { + // Return index of chosen item + let sumProbabilities = probabilities.reduce((acc, curr) => acc + curr, 0); + + let r = Math.random() * sumProbabilities; + for (let i = 0; i < probabilities.length; ++i) { + r -= probabilities[i]; + if (r <= 0) { + return i; + } } + return 0; // return first (most probable) as a fallback + } - /** - * Returns the specified logits as an array, with temperature applied. - * @param {Tensor} logits - * @param {number} index - * @returns {Float32Array} - */ - getLogits(logits, index) { - let vocabSize = logits.dims.at(-1); + /** + * Returns a Sampler object based on the specified options. 
+ * @param {GenerationConfigType} generation_config An object containing options for the sampler. + * @returns {Sampler} A Sampler object. + */ + static getSampler(generation_config) { + // - *greedy decoding*: `num_beams=1` and `do_sample=False` + // - *contrastive search*: `penalty_alpha>0` and `top_k>1` + // - *multinomial sampling*: `num_beams=1` and `do_sample=True` + // - *beam-search decoding*: `num_beams>1` and `do_sample=False` + // - *beam-search multinomial sampling*: `num_beams>1` and `do_sample=True` + // - *diverse beam-search decoding*: `num_beams>1` and `num_beam_groups>1` + // - *constrained beam-search decoding*: `constraints!=None` or `force_words_ids!=None` - let logs = /** @type {Float32Array} */(logits.data); - - if (index === -1) { - logs = logs.slice(-vocabSize); - } else { - let startIndex = index * vocabSize; - logs = logs.slice(startIndex, startIndex + vocabSize); - } - - // add temperature - if (this.generation_config.temperature > 0) { - logs = logs.map(x => x / this.generation_config.temperature) - } - return logs; - } - - /** - * Selects an item randomly based on the specified probabilities. - * @param {Array} probabilities An array of probabilities to use for selection. - * @returns {number} The index of the selected item. - */ - randomSelect(probabilities) { - // Return index of chosen item - let sumProbabilities = probabilities.reduce((acc, curr) => acc + curr, 0); - - let r = Math.random() * sumProbabilities; - for (let i = 0; i < probabilities.length; ++i) { - r -= probabilities[i]; - if (r <= 0) { - return i; - } - } - return 0; // return first (most probable) as a fallback - } - - /** - * Returns a Sampler object based on the specified options. - * @param {GenerationConfigType} generation_config An object containing options for the sampler. - * @returns {Sampler} A Sampler object. 
- */ - static getSampler(generation_config) { - // - *greedy decoding*: `num_beams=1` and `do_sample=False` - // - *contrastive search*: `penalty_alpha>0` and `top_k>1` - // - *multinomial sampling*: `num_beams=1` and `do_sample=True` - // - *beam-search decoding*: `num_beams>1` and `do_sample=False` - // - *beam-search multinomial sampling*: `num_beams>1` and `do_sample=True` - // - *diverse beam-search decoding*: `num_beams>1` and `num_beam_groups>1` - // - *constrained beam-search decoding*: `constraints!=None` or `force_words_ids!=None` - - // NOTE: beam search is implemented directly into the generation function - if (generation_config.do_sample) { - return new MultinomialSampler(generation_config); - - } else if (generation_config.num_beams > 1) { - return new BeamSearchSampler(generation_config); - - } else { - if (generation_config.num_return_sequences > 1) { - throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${generation_config.num_return_sequences}.`) - } - return new GreedySampler(generation_config); - } + // NOTE: beam search is implemented directly into the generation function + if (generation_config.do_sample) { + return new MultinomialSampler(generation_config); + } else if (generation_config.num_beams > 1) { + return new BeamSearchSampler(generation_config); + } else { + if (generation_config.num_return_sequences > 1) { + throw Error( + `num_return_sequences has to be 1 when doing greedy search, but is ${generation_config.num_return_sequences}.`, + ); + } + return new GreedySampler(generation_config); } + } } /** @@ -779,23 +804,21 @@ export class Sampler extends Callable { * @extends Sampler */ class GreedySampler extends Sampler { - /** - * Sample the maximum probability of a given logits tensor. - * @param {Tensor} logits - * @param {number} [index=-1] - * @returns {Array} An array with a single tuple, containing the index of the maximum value and a meaningless score (since this is a greedy search). 
- */ - sample(logits, index = -1) { - // NOTE: no need to do log_softmax here since we only take the maximum - let logs = this.getLogits(logits, index); - let argmax = max(logs)[1]; + /** + * Sample the maximum probability of a given logits tensor. + * @param {Tensor} logits + * @param {number} [index=-1] + * @returns {Array} An array with a single tuple, containing the index of the maximum value and a meaningless score (since this is a greedy search). + */ + sample(logits, index = -1) { + // NOTE: no need to do log_softmax here since we only take the maximum + let logs = this.getLogits(logits, index); + let argmax = max(logs)[1]; - // Note: score is meaningless in this context, since we are performing - // greedy search (p = 1 => log(p) = 0) - return [ - [argmax, 0] - ]; - } + // Note: score is meaningless in this context, since we are performing + // greedy search (p = 1 => log(p) = 0) + return [[argmax, 0]]; + } } /** @@ -803,71 +826,68 @@ class GreedySampler extends Sampler { * @extends Sampler */ class MultinomialSampler extends Sampler { - - /** - * Sample from the logits. - * @param {Tensor} logits - * @param {number} index - * @returns {Array} - */ - sample(logits, index = -1) { - let k = logits.dims.at(-1); // defaults to vocab size - if (this.generation_config.top_k > 0) { - k = Math.min(this.generation_config.top_k, k); - } - - // Get logits of nth token - const logs = this.getLogits(logits, index); - - // Get top k tokens - const topLogits = getTopItems(logs, k); - - // Compute softmax over logits - const probabilities = softmax(topLogits.map(x => x[1])); - - return Array.from({ length: this.generation_config.num_beams }, () => { - const sampledIndex = this.randomSelect(probabilities); - return [ - topLogits[sampledIndex][0], // token id - Math.log(probabilities[sampledIndex]), // score - ]; - }); + /** + * Sample from the logits. 
+ * @param {Tensor} logits + * @param {number} index + * @returns {Array} + */ + sample(logits, index = -1) { + let k = logits.dims.at(-1); // defaults to vocab size + if (this.generation_config.top_k > 0) { + k = Math.min(this.generation_config.top_k, k); } -} + // Get logits of nth token + const logs = this.getLogits(logits, index); + + // Get top k tokens + const topLogits = getTopItems(logs, k); + + // Compute softmax over logits + const probabilities = softmax(topLogits.map((x) => x[1])); + + return Array.from({ length: this.generation_config.num_beams }, () => { + const sampledIndex = this.randomSelect(probabilities); + return [ + topLogits[sampledIndex][0], // token id + Math.log(probabilities[sampledIndex]), // score + ]; + }); + } +} /** * Class representing a BeamSearchSampler. * @extends Sampler */ class BeamSearchSampler extends Sampler { - - /** - * Sample from the logits. - * @param {Tensor} logits - * @param {number} index - * @returns {Array} - */ - sample(logits, index = -1) { - let k = logits.dims.at(-1); // defaults to vocab size - if (this.generation_config.top_k > 0) { - k = Math.min(this.generation_config.top_k, k); - } - - // Get logits of nth token - const logs = this.getLogits(logits, index); - - // Get top k tokens - const topLogits = getTopItems(logs, k); - - // Compute softmax over logits - const probabilities = softmax(topLogits.map(x => x[1])); - - return Array.from({ length: this.generation_config.num_beams }, (_, i) => { - return [ - topLogits[i][0], // token id - Math.log(probabilities[i]), // score - ]; - }); + /** + * Sample from the logits. 
+ * @param {Tensor} logits + * @param {number} index + * @returns {Array} + */ + sample(logits, index = -1) { + let k = logits.dims.at(-1); // defaults to vocab size + if (this.generation_config.top_k > 0) { + k = Math.min(this.generation_config.top_k, k); } + + // Get logits of nth token + const logs = this.getLogits(logits, index); + + // Get top k tokens + const topLogits = getTopItems(logs, k); + + // Compute softmax over logits + const probabilities = softmax(topLogits.map((x) => x[1])); + + return Array.from({ length: this.generation_config.num_beams }, (_, i) => { + return [ + topLogits[i][0], // token id + Math.log(probabilities[i]), // score + ]; + }); + } } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/hub.js b/core/vendor/modules/@xenova/transformers/src/utils/hub.js index 93617674c..8aa3b7997 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/hub.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/hub.js @@ -1,24 +1,23 @@ - /** * @file Utility functions to interact with the Hugging Face Hub (https://huggingface.co/models) - * + * * @module utils/hub */ -import fs from 'fs'; -import path from 'path'; -import stream from 'stream/web'; +import fs from "fs"; +import path from "path"; +import stream from "stream/web"; -import { env } from '../env.js'; -import { dispatchCallback } from './core.js'; +import { env } from "../env.js"; +import { dispatchCallback } from "./core.js"; if (!globalThis.ReadableStream) { - // @ts-ignore - globalThis.ReadableStream = stream.ReadableStream; // ReadableStream is not a global with Node 16 + // @ts-ignore + globalThis.ReadableStream = stream.ReadableStream; // ReadableStream is not a global with Node 16 } /** - * @typedef {Object} PretrainedOptions Options for loading a pretrained model. + * @typedef {Object} PretrainedOptions Options for loading a pretrained model. 
* @property {boolean?} [quantized=true] Whether to load the 8-bit quantized version of the model (only applicable when loading model files). * @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates. * @property {Object} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: @@ -33,121 +32,124 @@ if (!globalThis.ReadableStream) { */ class FileResponse { - /** - * Mapping from file extensions to MIME types. - */ - _CONTENT_TYPE_MAP = { - 'txt': 'text/plain', - 'html': 'text/html', - 'css': 'text/css', - 'js': 'text/javascript', - 'json': 'application/json', - 'png': 'image/png', - 'jpg': 'image/jpeg', - 'jpeg': 'image/jpeg', - 'gif': 'image/gif', + /** + * Mapping from file extensions to MIME types. + */ + _CONTENT_TYPE_MAP = { + txt: "text/plain", + html: "text/html", + css: "text/css", + js: "text/javascript", + json: "application/json", + png: "image/png", + jpg: "image/jpeg", + jpeg: "image/jpeg", + gif: "image/gif", + }; + /** + * Creates a new `FileResponse` object. + * @param {string|URL} filePath + */ + constructor(filePath) { + this.filePath = filePath; + this.headers = new Headers(); + + this.exists = fs.existsSync(filePath); + if (this.exists) { + this.status = 200; + this.statusText = "OK"; + + let stats = fs.statSync(filePath); + this.headers.set("content-length", stats.size.toString()); + + this.updateContentType(); + + let self = this; + this.body = new ReadableStream({ + start(controller) { + self.arrayBuffer().then((buffer) => { + controller.enqueue(new Uint8Array(buffer)); + controller.close(); + }); + }, + }); + } else { + this.status = 404; + this.statusText = "Not Found"; + this.body = null; } - /** - * Creates a new `FileResponse` object. 
- * @param {string|URL} filePath - */ - constructor(filePath) { - this.filePath = filePath; - this.headers = new Headers(); + } - this.exists = fs.existsSync(filePath); - if (this.exists) { - this.status = 200; - this.statusText = 'OK'; + /** + * Updates the 'content-type' header property of the response based on the extension of + * the file specified by the filePath property of the current object. + * @returns {void} + */ + updateContentType() { + // Set content-type header based on file extension + const extension = this.filePath.toString().split(".").pop().toLowerCase(); + this.headers.set( + "content-type", + this._CONTENT_TYPE_MAP[extension] ?? "application/octet-stream", + ); + } - let stats = fs.statSync(filePath); - this.headers.set('content-length', stats.size.toString()); + /** + * Clone the current FileResponse object. + * @returns {FileResponse} A new FileResponse object with the same properties as the current object. + */ + clone() { + let response = new FileResponse(this.filePath); + response.exists = this.exists; + response.status = this.status; + response.statusText = this.statusText; + response.headers = new Headers(this.headers); + return response; + } - this.updateContentType(); + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with an ArrayBuffer containing the file's contents. + * @returns {Promise} A Promise that resolves with an ArrayBuffer containing the file's contents. + * @throws {Error} If the file cannot be read. 
+ */ + async arrayBuffer() { + const data = await fs.promises.readFile(this.filePath); + return data.buffer; + } - let self = this; - this.body = new ReadableStream({ - start(controller) { - self.arrayBuffer().then(buffer => { - controller.enqueue(new Uint8Array(buffer)); - controller.close(); - }) - } - }); - } else { - this.status = 404; - this.statusText = 'Not Found'; - this.body = null; - } - } + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a Blob containing the file's contents. + * @returns {Promise} A Promise that resolves with a Blob containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + async blob() { + const data = await fs.promises.readFile(this.filePath); + return new Blob([data], { type: this.headers.get("content-type") }); + } - /** - * Updates the 'content-type' header property of the response based on the extension of - * the file specified by the filePath property of the current object. - * @returns {void} - */ - updateContentType() { - // Set content-type header based on file extension - const extension = this.filePath.toString().split('.').pop().toLowerCase(); - this.headers.set('content-type', this._CONTENT_TYPE_MAP[extension] ?? 'application/octet-stream'); - } + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a string containing the file's contents. + * @returns {Promise} A Promise that resolves with a string containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + async text() { + const data = await fs.promises.readFile(this.filePath, "utf8"); + return data; + } - /** - * Clone the current FileResponse object. - * @returns {FileResponse} A new FileResponse object with the same properties as the current object. 
- */ - clone() { - let response = new FileResponse(this.filePath); - response.exists = this.exists; - response.status = this.status; - response.statusText = this.statusText; - response.headers = new Headers(this.headers); - return response; - } - - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with an ArrayBuffer containing the file's contents. - * @returns {Promise} A Promise that resolves with an ArrayBuffer containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - async arrayBuffer() { - const data = await fs.promises.readFile(this.filePath); - return data.buffer; - } - - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a Blob containing the file's contents. - * @returns {Promise} A Promise that resolves with a Blob containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - async blob() { - const data = await fs.promises.readFile(this.filePath); - return new Blob([data], { type: this.headers.get('content-type') }); - } - - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a string containing the file's contents. - * @returns {Promise} A Promise that resolves with a string containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - async text() { - const data = await fs.promises.readFile(this.filePath, 'utf8'); - return data; - } - - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a parsed JavaScript object containing the file's contents. - * - * @returns {Promise} A Promise that resolves with a parsed JavaScript object containing the file's contents. - * @throws {Error} If the file cannot be read. 
- */ - async json() { - return JSON.parse(await this.text()); - } + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a parsed JavaScript object containing the file's contents. + * + * @returns {Promise} A Promise that resolves with a parsed JavaScript object containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + async json() { + return JSON.parse(await this.text()); + } } /** @@ -157,17 +159,17 @@ class FileResponse { * @returns {boolean} True if the string is a valid HTTP or HTTPS URL, false otherwise. */ function isValidHttpUrl(string, validHosts = null) { - // https://stackoverflow.com/a/43467144 - let url; - try { - url = new URL(string); - } catch (_) { - return false; - } - if (validHosts && !validHosts.includes(url.hostname)) { - return false; - } - return url.protocol === "http:" || url.protocol === "https:"; + // https://stackoverflow.com/a/43467144 + let url; + try { + url = new URL(string); + } catch (_) { + return false; + } + if (validHosts && !validHosts.includes(url.hostname)) { + return false; + } + return url.protocol === "http:" || url.protocol === "https:"; } /** @@ -177,51 +179,52 @@ function isValidHttpUrl(string, validHosts = null) { * @returns {Promise} A promise that resolves to a FileResponse object (if the file is retrieved using the FileSystem API), or a Response object (if the file is retrieved using the Fetch API). 
*/ export async function getFile(urlOrPath) { + if (env.useFS && !isValidHttpUrl(urlOrPath)) { + return new FileResponse(urlOrPath); + } else if ( + typeof process !== "undefined" && + process?.release?.name === "node" + ) { + const IS_CI = !!process.env?.TESTING_REMOTELY; + const version = env.version; - if (env.useFS && !isValidHttpUrl(urlOrPath)) { - return new FileResponse(urlOrPath); + const headers = new Headers(); + headers.set("User-Agent", `transformers.js/${version}; is_ci/${IS_CI};`); - } else if (typeof process !== 'undefined' && process?.release?.name === 'node') { - const IS_CI = !!process.env?.TESTING_REMOTELY; - const version = env.version; - - const headers = new Headers(); - headers.set('User-Agent', `transformers.js/${version}; is_ci/${IS_CI};`); - - // Check whether we are making a request to the Hugging Face Hub. - const isHFURL = isValidHttpUrl(urlOrPath, ['huggingface.co', 'hf.co']); - if (isHFURL) { - // If an access token is present in the environment variables, - // we add it to the request headers. - // NOTE: We keep `HF_ACCESS_TOKEN` for backwards compatibility (as a fallback). - const token = process.env?.HF_TOKEN ?? process.env?.HF_ACCESS_TOKEN; - if (token) { - headers.set('Authorization', `Bearer ${token}`); - } - } - return fetch(urlOrPath, { headers }); - } else { - // Running in a browser-environment, so we use default headers - // NOTE: We do not allow passing authorization headers in the browser, - // since this would require exposing the token to the client. - return fetch(urlOrPath); + // Check whether we are making a request to the Hugging Face Hub. + const isHFURL = isValidHttpUrl(urlOrPath, ["huggingface.co", "hf.co"]); + if (isHFURL) { + // If an access token is present in the environment variables, + // we add it to the request headers. + // NOTE: We keep `HF_ACCESS_TOKEN` for backwards compatibility (as a fallback). + const token = process.env?.HF_TOKEN ?? 
process.env?.HF_ACCESS_TOKEN; + if (token) { + headers.set("Authorization", `Bearer ${token}`); + } } + return fetch(urlOrPath, { headers }); + } else { + // Running in a browser-environment, so we use default headers + // NOTE: We do not allow passing authorization headers in the browser, + // since this would require exposing the token to the client. + return fetch(urlOrPath); + } } const ERROR_MAPPING = { - // 4xx errors (https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#client_error_responses) - 400: 'Bad request error occurred while trying to load file', - 401: 'Unauthorized access to file', - 403: 'Forbidden access to file', - 404: 'Could not locate file', - 408: 'Request timeout error occurred while trying to load file', + // 4xx errors (https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#client_error_responses) + 400: "Bad request error occurred while trying to load file", + 401: "Unauthorized access to file", + 403: "Forbidden access to file", + 404: "Could not locate file", + 408: "Request timeout error occurred while trying to load file", - // 5xx errors (https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#server_error_responses) - 500: 'Internal server error error occurred while trying to load file', - 502: 'Bad gateway error occurred while trying to load file', - 503: 'Service unavailable error occurred while trying to load file', - 504: 'Gateway timeout error occurred while trying to load file', -} + // 5xx errors (https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#server_error_responses) + 500: "Internal server error error occurred while trying to load file", + 502: "Bad gateway error occurred while trying to load file", + 503: "Service unavailable error occurred while trying to load file", + 504: "Gateway timeout error occurred while trying to load file", +}; /** * Helper method to handle fatal errors that occur while trying to load a file from the Hugging Face Hub. * @param {number} status The HTTP status code of the error. 
@@ -231,334 +234,355 @@ const ERROR_MAPPING = { * @throws {Error} If `fatal = false`. */ function handleError(status, remoteURL, fatal) { - if (!fatal) { - // File was not loaded correctly, but it is optional. - // TODO in future, cache the response? - return null; - } + if (!fatal) { + // File was not loaded correctly, but it is optional. + // TODO in future, cache the response? + return null; + } - const message = ERROR_MAPPING[status] ?? `Error (${status}) occurred while trying to load file`; - throw Error(`${message}: "${remoteURL}".`); + const message = + ERROR_MAPPING[status] ?? + `Error (${status}) occurred while trying to load file`; + throw Error(`${message}: "${remoteURL}".`); } class FileCache { - /** - * Instantiate a `FileCache` object. - * @param {string} path - */ - constructor(path) { - this.path = path; + /** + * Instantiate a `FileCache` object. + * @param {string} path + */ + constructor(path) { + this.path = path; + } + + /** + * Checks whether the given request is in the cache. + * @param {string} request + * @returns {Promise} + */ + async match(request) { + let filePath = path.join(this.path, request); + let file = new FileResponse(filePath); + + if (file.exists) { + return file; + } else { + return undefined; } + } - /** - * Checks whether the given request is in the cache. - * @param {string} request - * @returns {Promise} - */ - async match(request) { + /** + * Adds the given response to the cache. 
+ * @param {string} request + * @param {Response|FileResponse} response + * @returns {Promise} + */ + async put(request, response) { + const buffer = Buffer.from(await response.arrayBuffer()); - let filePath = path.join(this.path, request); - let file = new FileResponse(filePath); + let outputPath = path.join(this.path, request); - if (file.exists) { - return file; - } else { - return undefined; - } + try { + await fs.promises.mkdir(path.dirname(outputPath), { recursive: true }); + await fs.promises.writeFile(outputPath, buffer); + } catch (err) { + console.warn("An error occurred while writing the file to cache:", err); } + } - /** - * Adds the given response to the cache. - * @param {string} request - * @param {Response|FileResponse} response - * @returns {Promise} - */ - async put(request, response) { - const buffer = Buffer.from(await response.arrayBuffer()); - - let outputPath = path.join(this.path, request); - - try { - await fs.promises.mkdir(path.dirname(outputPath), { recursive: true }); - await fs.promises.writeFile(outputPath, buffer); - - } catch (err) { - console.warn('An error occurred while writing the file to cache:', err) - } - } - - // TODO add the rest? - // addAll(requests: RequestInfo[]): Promise; - // delete(request: RequestInfo | URL, options?: CacheQueryOptions): Promise; - // keys(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise>; - // match(request: RequestInfo | URL, options?: CacheQueryOptions): Promise; - // matchAll(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise>; + // TODO add the rest? 
+ // addAll(requests: RequestInfo[]): Promise; + // delete(request: RequestInfo | URL, options?: CacheQueryOptions): Promise; + // keys(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise>; + // match(request: RequestInfo | URL, options?: CacheQueryOptions): Promise; + // matchAll(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise>; } /** - * + * * @param {FileCache|Cache} cache The cache to search * @param {string[]} names The names of the item to search for * @returns {Promise} The item from the cache, or undefined if not found. */ async function tryCache(cache, ...names) { - for (let name of names) { - try { - let result = await cache.match(name); - if (result) return result; - } catch (e) { - continue; - } + for (let name of names) { + try { + let result = await cache.match(name); + if (result) return result; + } catch (e) { + continue; } - return undefined; + } + return undefined; } /** - * + * * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. * If the filesystem is available and `env.useCache = true`, the file will be downloaded and cached. - * + * * @param {string} path_or_repo_id This can be either: * - a string, the *model id* of a model repo on huggingface.co. * - a path to a *directory* potentially containing the file. * @param {string} filename The name of the file to locate in `path_or_repo`. * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. * @param {PretrainedOptions} [options] An object containing optional parameters. - * + * * @throws Will throw an error if the file is not found and `fatal` is true. * @returns {Promise} A Promise that resolves with the file content as a buffer. 
*/ -export async function getModelFile(path_or_repo_id, filename, fatal = true, options = {}) { +export async function getModelFile( + path_or_repo_id, + filename, + fatal = true, + options = {}, +) { + if (!env.allowLocalModels) { + // User has disabled local models, so we just make sure other settings are correct. - if (!env.allowLocalModels) { - // User has disabled local models, so we just make sure other settings are correct. + if (options.local_files_only) { + throw Error( + "Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).", + ); + } else if (!env.allowRemoteModels) { + throw Error( + "Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.", + ); + } + } - if (options.local_files_only) { - throw Error("Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).") - } else if (!env.allowRemoteModels) { - throw Error("Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.") - } + // Initiate file retrieval + dispatchCallback(options.progress_callback, { + status: "initiate", + name: path_or_repo_id, + file: filename, + }); + + // First, check if the a caching backend is available + // If no caching mechanism available, will download the file every time + let cache; + if (!cache && env.useBrowserCache) { + if (typeof caches === "undefined") { + throw Error("Browser cache is not available in this environment."); + } + try { + // In some cases, the browser cache may be visible, but not accessible due to security restrictions. 
+ // For example, when running an application in an iframe, if a user attempts to load the page in + // incognito mode, the following error is thrown: `DOMException: Failed to execute 'open' on 'CacheStorage': + // An attempt was made to break through the security policy of the user agent.` + // So, instead of crashing, we just ignore the error and continue without using the cache. + cache = await caches.open("transformers-cache"); + } catch (e) { + console.warn("An error occurred while opening the browser cache:", e); + } + } + + if (!cache && env.useFSCache) { + // TODO throw error if not available + + // If `cache_dir` is not specified, use the default cache directory + cache = new FileCache(options.cache_dir ?? env.cacheDir); + } + + if (!cache && env.useCustomCache) { + // Allow the user to specify a custom cache system. + if (!env.customCache) { + throw Error( + "`env.useCustomCache=true`, but `env.customCache` is not defined.", + ); } - // Initiate file retrieval - dispatchCallback(options.progress_callback, { - status: 'initiate', - name: path_or_repo_id, - file: filename - }) + // Check that the required methods are defined: + if (!env.customCache.match || !env.customCache.put) { + throw new Error( + "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. " + + "For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache", + ); + } + cache = env.customCache; + } - // First, check if the a caching backend is available - // If no caching mechanism available, will download the file every time - let cache; - if (!cache && env.useBrowserCache) { - if (typeof caches === 'undefined') { - throw Error('Browser cache is not available in this environment.') - } + const revision = options.revision ?? 
"main"; + + let requestURL = pathJoin(path_or_repo_id, filename); + let localPath = pathJoin(env.localModelPath, requestURL); + + let remoteURL = pathJoin( + env.remoteHost, + env.remotePathTemplate + .replaceAll("{model}", path_or_repo_id) + .replaceAll("{revision}", encodeURIComponent(revision)), + filename, + ); + + // Choose cache key for filesystem cache + // When using the main revision (default), we use the request URL as the cache key. + // If a specific revision is requested, we account for this in the cache key. + let fsCacheKey = + revision === "main" + ? requestURL + : pathJoin(path_or_repo_id, revision, filename); + + /** @type {string} */ + let cacheKey; + let proposedCacheKey = cache instanceof FileCache ? fsCacheKey : remoteURL; + + // Whether to cache the final response in the end. + let toCacheResponse = false; + + /** @type {Response|FileResponse|undefined} */ + let response; + + if (cache) { + // A caching system is available, so we try to get the file from it. + // 1. We first try to get from cache using the local path. In some environments (like deno), + // non-URL cache keys are not allowed. In these cases, `response` will be undefined. + // 2. If no response is found, we try to get from cache using the remote URL or file system cache. + response = await tryCache(cache, localPath, proposedCacheKey); + } + + const cacheHit = response !== undefined; + + if (response === undefined) { + // Caching not available, or file is not cached, so we perform the request + + if (env.allowLocalModels) { + // Accessing local models is enabled, so we try to get the file locally. + // If request is a valid HTTP URL, we skip the local file check. Otherwise, we try to get the file locally. + const isURL = isValidHttpUrl(requestURL); + if (!isURL) { try { - // In some cases, the browser cache may be visible, but not accessible due to security restrictions. 
- // For example, when running an application in an iframe, if a user attempts to load the page in - // incognito mode, the following error is thrown: `DOMException: Failed to execute 'open' on 'CacheStorage': - // An attempt was made to break through the security policy of the user agent.` - // So, instead of crashing, we just ignore the error and continue without using the cache. - cache = await caches.open('transformers-cache'); + response = await getFile(localPath); + cacheKey = localPath; // Update the cache key to be the local path } catch (e) { - console.warn('An error occurred while opening the browser cache:', e); + // Something went wrong while trying to get the file locally. + // NOTE: error handling is done in the next step (since `response` will be undefined) + console.warn(`Unable to load from local path "${localPath}": "${e}"`); } + } else if (options.local_files_only) { + throw new Error( + `\`local_files_only=true\`, but attempted to load a remote file from: ${requestURL}.`, + ); + } else if (!env.allowRemoteModels) { + throw new Error( + `\`env.allowRemoteModels=false\`, but attempted to load a remote file from: ${requestURL}.`, + ); + } } - if (!cache && env.useFSCache) { - // TODO throw error if not available + if (response === undefined || response.status === 404) { + // File not found locally. This means either: + // - The user has disabled local file access (`env.allowLocalModels=false`) + // - the path is a valid HTTP url (`response === undefined`) + // - the path is not a valid HTTP url and the file is not present on the file system or local server (`response.status === 404`) - // If `cache_dir` is not specified, use the default cache directory - cache = new FileCache(options.cache_dir ?? env.cacheDir); + if (options.local_files_only || !env.allowRemoteModels) { + // User requested local files only, but the file is not found locally. 
+ if (fatal) { + throw Error( + `\`local_files_only=true\` or \`env.allowRemoteModels=false\` and file was not found locally at "${localPath}".`, + ); + } else { + // File not found, but this file is optional. + // TODO in future, cache the response? + return null; + } + } + + // File not found locally, so we try to download it from the remote server + response = await getFile(remoteURL); + + if (response.status !== 200) { + return handleError(response.status, remoteURL, fatal); + } + + // Success! We use the proposed cache key from earlier + cacheKey = proposedCacheKey; } - if (!cache && env.useCustomCache) { - // Allow the user to specify a custom cache system. - if (!env.customCache) { - throw Error('`env.useCustomCache=true`, but `env.customCache` is not defined.') - } + // Only cache the response if: + toCacheResponse = + cache && // 1. A caching system is available + typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment) + response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`) + response.status === 200; // 4. request was successful (status code 200) + } - // Check that the required methods are defined: - if (!env.customCache.match || !env.customCache.put) { - throw new Error( - "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. " + - "For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache" - ) - } - cache = env.customCache; - } + // Start downloading + dispatchCallback(options.progress_callback, { + status: "download", + name: path_or_repo_id, + file: filename, + }); - const revision = options.revision ?? 
'main'; + const progressInfo = { + status: "progress", + name: path_or_repo_id, + file: filename, + }; - let requestURL = pathJoin(path_or_repo_id, filename); - let localPath = pathJoin(env.localModelPath, requestURL); + /** @type {Uint8Array} */ + let buffer; - let remoteURL = pathJoin( - env.remoteHost, - env.remotePathTemplate - .replaceAll('{model}', path_or_repo_id) - .replaceAll('{revision}', encodeURIComponent(revision)), - filename - ); + if (!options.progress_callback) { + // If no progress callback is specified, we can use the `.arrayBuffer()` + // method to read the response. + buffer = new Uint8Array(await response.arrayBuffer()); + } else if ( + cacheHit && // The item is being read from the cache + typeof navigator !== "undefined" && + /firefox/i.test(navigator.userAgent) // We are in Firefox + ) { + // Due to bug in Firefox, we cannot display progress when loading from cache. + // Fortunately, since this should be instantaneous, this should not impact users too much. + buffer = new Uint8Array(await response.arrayBuffer()); - // Choose cache key for filesystem cache - // When using the main revision (default), we use the request URL as the cache key. - // If a specific revision is requested, we account for this in the cache key. - let fsCacheKey = revision === 'main' ? requestURL : pathJoin(path_or_repo_id, revision, filename); - - /** @type {string} */ - let cacheKey; - let proposedCacheKey = cache instanceof FileCache ? fsCacheKey : remoteURL; - - // Whether to cache the final response in the end. - let toCacheResponse = false; - - /** @type {Response|FileResponse|undefined} */ - let response; - - if (cache) { - // A caching system is available, so we try to get the file from it. - // 1. We first try to get from cache using the local path. In some environments (like deno), - // non-URL cache keys are not allowed. In these cases, `response` will be undefined. - // 2. 
If no response is found, we try to get from cache using the remote URL or file system cache. - response = await tryCache(cache, localPath, proposedCacheKey); - } - - const cacheHit = response !== undefined; - - if (response === undefined) { - // Caching not available, or file is not cached, so we perform the request - - if (env.allowLocalModels) { - // Accessing local models is enabled, so we try to get the file locally. - // If request is a valid HTTP URL, we skip the local file check. Otherwise, we try to get the file locally. - const isURL = isValidHttpUrl(requestURL); - if (!isURL) { - try { - response = await getFile(localPath); - cacheKey = localPath; // Update the cache key to be the local path - } catch (e) { - // Something went wrong while trying to get the file locally. - // NOTE: error handling is done in the next step (since `response` will be undefined) - console.warn(`Unable to load from local path "${localPath}": "${e}"`); - } - } else if (options.local_files_only) { - throw new Error(`\`local_files_only=true\`, but attempted to load a remote file from: ${requestURL}.`); - } else if (!env.allowRemoteModels) { - throw new Error(`\`env.allowRemoteModels=false\`, but attempted to load a remote file from: ${requestURL}.`); - } - } - - if (response === undefined || response.status === 404) { - // File not found locally. This means either: - // - The user has disabled local file access (`env.allowLocalModels=false`) - // - the path is a valid HTTP url (`response === undefined`) - // - the path is not a valid HTTP url and the file is not present on the file system or local server (`response.status === 404`) - - if (options.local_files_only || !env.allowRemoteModels) { - // User requested local files only, but the file is not found locally. - if (fatal) { - throw Error(`\`local_files_only=true\` or \`env.allowRemoteModels=false\` and file was not found locally at "${localPath}".`); - } else { - // File not found, but this file is optional. 
- // TODO in future, cache the response? - return null; - } - } - - // File not found locally, so we try to download it from the remote server - response = await getFile(remoteURL); - - if (response.status !== 200) { - return handleError(response.status, remoteURL, fatal); - } - - // Success! We use the proposed cache key from earlier - cacheKey = proposedCacheKey; - } - - // Only cache the response if: - toCacheResponse = - cache // 1. A caching system is available - && typeof Response !== 'undefined' // 2. `Response` is defined (i.e., we are in a browser-like environment) - && response instanceof Response // 3. result is a `Response` object (i.e., not a `FileResponse`) - && response.status === 200 // 4. request was successful (status code 200) - } - - // Start downloading + // For completeness, we still fire the final progress callback dispatchCallback(options.progress_callback, { - status: 'download', - name: path_or_repo_id, - file: filename - }) - - const progressInfo = { - status: 'progress', - name: path_or_repo_id, - file: filename - } - - /** @type {Uint8Array} */ - let buffer; - - if (!options.progress_callback) { - // If no progress callback is specified, we can use the `.arrayBuffer()` - // method to read the response. - buffer = new Uint8Array(await response.arrayBuffer()); - - } else if ( - cacheHit // The item is being read from the cache - && - typeof navigator !== 'undefined' && /firefox/i.test(navigator.userAgent) // We are in Firefox - ) { - // Due to bug in Firefox, we cannot display progress when loading from cache. - // Fortunately, since this should be instantaneous, this should not impact users too much. 
- buffer = new Uint8Array(await response.arrayBuffer()); - - // For completeness, we still fire the final progress callback - dispatchCallback(options.progress_callback, { - ...progressInfo, - progress: 100, - loaded: buffer.length, - total: buffer.length, - }) - } else { - buffer = await readResponse(response, data => { - dispatchCallback(options.progress_callback, { - ...progressInfo, - ...data, - }) - }) - } - - if ( - // Only cache web responses - // i.e., do not cache FileResponses (prevents duplication) - toCacheResponse && cacheKey - && - // Check again whether request is in cache. If not, we add the response to the cache - (await cache.match(cacheKey) === undefined) - ) { - // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files - await cache.put(cacheKey, new Response(buffer, { - headers: response.headers - })) - .catch(err => { - // Do not crash if unable to add to cache (e.g., QuotaExceededError). - // Rather, log a warning and proceed with execution. - console.warn(`Unable to add response to browser cache: ${err}.`); - }); - - } - - dispatchCallback(options.progress_callback, { - status: 'done', - name: path_or_repo_id, - file: filename + ...progressInfo, + progress: 100, + loaded: buffer.length, + total: buffer.length, }); + } else { + buffer = await readResponse(response, (data) => { + dispatchCallback(options.progress_callback, { + ...progressInfo, + ...data, + }); + }); + } - return buffer; + if ( + // Only cache web responses + // i.e., do not cache FileResponses (prevents duplication) + toCacheResponse && + cacheKey && + // Check again whether request is in cache. 
If not, we add the response to the cache + (await cache.match(cacheKey)) === undefined + ) { + // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files + await cache + .put( + cacheKey, + new Response(buffer, { + headers: response.headers, + }), + ) + .catch((err) => { + // Do not crash if unable to add to cache (e.g., QuotaExceededError). + // Rather, log a warning and proceed with execution. + console.warn(`Unable to add response to browser cache: ${err}.`); + }); + } + + dispatchCallback(options.progress_callback, { + status: "done", + name: path_or_repo_id, + file: filename, + }); + + return buffer; } /** @@ -571,17 +595,22 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti * @returns {Promise} The JSON data parsed into a JavaScript object. * @throws Will throw an error if the file is not found and `fatal` is true. */ -export async function getModelJSON(modelPath, fileName, fatal = true, options = {}) { - let buffer = await getModelFile(modelPath, fileName, fatal, options); - if (buffer === null) { - // Return empty object - return {} - } +export async function getModelJSON( + modelPath, + fileName, + fatal = true, + options = {}, +) { + let buffer = await getModelFile(modelPath, fileName, fatal, options); + if (buffer === null) { + // Return empty object + return {}; + } - let decoder = new TextDecoder('utf-8'); - let jsonData = decoder.decode(buffer); + let decoder = new TextDecoder("utf-8"); + let jsonData = decoder.decode(buffer); - return JSON.parse(jsonData); + return JSON.parse(jsonData); } /** @@ -592,52 +621,53 @@ export async function getModelJSON(modelPath, fileName, fatal = true, options = * @returns {Promise} A Promise that resolves with the Uint8Array buffer */ async function readResponse(response, progress_callback) { + const contentLength = response.headers.get("Content-Length"); + if (contentLength === null) { + console.warn( + "Unable to determine content-length from 
response headers. Will expand buffer when needed.", + ); + } + let total = parseInt(contentLength ?? "0"); + let buffer = new Uint8Array(total); + let loaded = 0; - const contentLength = response.headers.get('Content-Length'); - if (contentLength === null) { - console.warn('Unable to determine content-length from response headers. Will expand buffer when needed.') + const reader = response.body.getReader(); + async function read() { + const { done, value } = await reader.read(); + if (done) return; + + let newLoaded = loaded + value.length; + if (newLoaded > total) { + total = newLoaded; + + // Adding the new data will overflow buffer. + // In this case, we extend the buffer + let newBuffer = new Uint8Array(total); + + // copy contents + newBuffer.set(buffer); + + buffer = newBuffer; } - let total = parseInt(contentLength ?? '0'); - let buffer = new Uint8Array(total); - let loaded = 0; + buffer.set(value, loaded); + loaded = newLoaded; - const reader = response.body.getReader(); - async function read() { - const { done, value } = await reader.read(); - if (done) return; + const progress = (loaded / total) * 100; - let newLoaded = loaded + value.length; - if (newLoaded > total) { - total = newLoaded; + // Call your function here + progress_callback({ + progress: progress, + loaded: loaded, + total: total, + }); - // Adding the new data will overflow buffer. 
- // In this case, we extend the buffer - let newBuffer = new Uint8Array(total); + return read(); + } - // copy contents - newBuffer.set(buffer); + // Actually read + await read(); - buffer = newBuffer; - } - buffer.set(value, loaded) - loaded = newLoaded; - - const progress = (loaded / total) * 100; - - // Call your function here - progress_callback({ - progress: progress, - loaded: loaded, - total: total, - }) - - return read(); - } - - // Actually read - await read(); - - return buffer; + return buffer; } /** @@ -647,15 +677,15 @@ async function readResponse(response, progress_callback) { * @returns {string} A string representing the joined path. */ function pathJoin(...parts) { - // https://stackoverflow.com/a/55142565 - parts = parts.map((part, index) => { - if (index) { - part = part.replace(new RegExp('^/'), ''); - } - if (index !== parts.length - 1) { - part = part.replace(new RegExp('/$'), ''); - } - return part; - }) - return parts.join('/'); + // https://stackoverflow.com/a/55142565 + parts = parts.map((part, index) => { + if (index) { + part = part.replace(new RegExp("^/"), ""); + } + if (index !== parts.length - 1) { + part = part.replace(new RegExp("/$"), ""); + } + return part; + }); + return parts.join("/"); } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/image.js b/core/vendor/modules/@xenova/transformers/src/utils/image.js index 2d12cb876..270469177 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/image.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/image.js @@ -1,713 +1,794 @@ - /** - * @file Helper module for image processing. - * - * These functions and classes are only used internally, + * @file Helper module for image processing. + * + * These functions and classes are only used internally, * meaning an end-user shouldn't need to access anything here. 
- * + * * @module utils/image */ -import { getFile } from './hub.js'; -import { env } from '../env.js'; +import { getFile } from "./hub.js"; +import { env } from "../env.js"; // Will be empty (or not used) if running in browser or web-worker -import sharp from 'sharp'; +import sharp from "sharp"; -const BROWSER_ENV = typeof self !== 'undefined'; -const WEBWORKER_ENV = BROWSER_ENV && self.constructor.name === 'DedicatedWorkerGlobalScope'; +const BROWSER_ENV = typeof self !== "undefined"; +const WEBWORKER_ENV = + BROWSER_ENV && self.constructor.name === "DedicatedWorkerGlobalScope"; let createCanvasFunction; let ImageDataClass; let loadImageFunction; if (BROWSER_ENV) { - // Running in browser or web-worker - createCanvasFunction = (/** @type {number} */ width, /** @type {number} */ height) => { - if (!self.OffscreenCanvas) { - throw new Error('OffscreenCanvas not supported by this browser.'); - } - return new self.OffscreenCanvas(width, height) - }; - loadImageFunction = self.createImageBitmap; - ImageDataClass = self.ImageData; - -} else if (sharp) { - // Running in Node.js, electron, or other non-browser environment - - loadImageFunction = async (/**@type {sharp.Sharp}*/img) => { - const metadata = await img.metadata(); - const rawChannels = metadata.channels; - - let { data, info } = await img.raw().toBuffer({ resolveWithObject: true }); - - const newImage = new RawImage(new Uint8ClampedArray(data), info.width, info.height, info.channels); - if (rawChannels !== undefined && rawChannels !== info.channels) { - // Make sure the new image has the same number of channels as the input image. - // This is necessary for grayscale images. 
- newImage.convert(rawChannels); - } - return newImage; + // Running in browser or web-worker + createCanvasFunction = ( + /** @type {number} */ width, + /** @type {number} */ height, + ) => { + if (!self.OffscreenCanvas) { + throw new Error("OffscreenCanvas not supported by this browser."); } + return new self.OffscreenCanvas(width, height); + }; + loadImageFunction = self.createImageBitmap; + ImageDataClass = self.ImageData; +} else if (sharp) { + // Running in Node.js, electron, or other non-browser environment + loadImageFunction = async (/**@type {sharp.Sharp}*/ img) => { + const metadata = await img.metadata(); + const rawChannels = metadata.channels; + + let { data, info } = await img.raw().toBuffer({ resolveWithObject: true }); + + const newImage = new RawImage( + new Uint8ClampedArray(data), + info.width, + info.height, + info.channels, + ); + if (rawChannels !== undefined && rawChannels !== info.channels) { + // Make sure the new image has the same number of channels as the input image. + // This is necessary for grayscale images. + newImage.convert(rawChannels); + } + return newImage; + }; } else { - throw new Error('Unable to load image processing library.'); + throw new Error("Unable to load image processing library."); } - // Defined here: https://github.com/python-pillow/Pillow/blob/a405e8406b83f8bfb8916e93971edc7407b8b1ff/src/libImaging/Imaging.h#L262-L268 const RESAMPLING_MAPPING = { - 0: 'nearest', - 1: 'lanczos', - 2: 'bilinear', - 3: 'bicubic', - 4: 'box', - 5: 'hamming', -} + 0: "nearest", + 1: "lanczos", + 2: "bilinear", + 3: "bicubic", + 4: "box", + 5: "hamming", +}; /** * Mapping from file extensions to MIME types. */ const CONTENT_TYPE_MAP = new Map([ - ['png', 'image/png'], - ['jpg', 'image/jpeg'], - ['jpeg', 'image/jpeg'], - ['gif', 'image/gif'], + ["png", "image/png"], + ["jpg", "image/jpeg"], + ["jpeg", "image/jpeg"], + ["gif", "image/gif"], ]); export class RawImage { + /** + * Create a new `RawImage` object. 
+ * @param {Uint8ClampedArray|Uint8Array} data The pixel data. + * @param {number} width The width of the image. + * @param {number} height The height of the image. + * @param {1|2|3|4} channels The number of channels. + */ + constructor(data, width, height, channels) { + this.data = data; + this.width = width; + this.height = height; + this.channels = channels; + } - /** - * Create a new `RawImage` object. - * @param {Uint8ClampedArray|Uint8Array} data The pixel data. - * @param {number} width The width of the image. - * @param {number} height The height of the image. - * @param {1|2|3|4} channels The number of channels. - */ - constructor(data, width, height, channels) { - this.data = data; - this.width = width; - this.height = height; - this.channels = channels; + /** + * Returns the size of the image (width, height). + * @returns {[number, number]} The size of the image (width, height). + */ + get size() { + return [this.width, this.height]; + } + + /** + * Helper method for reading an image from a variety of input types. + * @param {RawImage|string|URL} input + * @returns The image object. + * + * **Example:** Read image from a URL. + * ```javascript + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * // RawImage { + * // "data": Uint8ClampedArray [ 25, 25, 25, 19, 19, 19, ... ], + * // "width": 800, + * // "height": 533, + * // "channels": 3 + * // } + * ``` + */ + static async read(input) { + if (input instanceof RawImage) { + return input; + } else if (typeof input === "string" || input instanceof URL) { + return await this.fromURL(input); + } else { + throw new Error(`Unsupported input type: ${typeof input}`); + } + } + + /** + * Read an image from a URL or file path. + * @param {string|URL} url The URL or file path to read the image from. + * @returns {Promise} The image object. 
+ */ + static async fromURL(url) { + let response = await getFile(url); + if (response.status !== 200) { + throw new Error( + `Unable to read image from "${url}" (${response.status} ${response.statusText})`, + ); + } + let blob = await response.blob(); + return this.fromBlob(blob); + } + + /** + * Helper method to create a new Image from a blob. + * @param {Blob} blob The blob to read the image from. + * @returns {Promise} The image object. + */ + static async fromBlob(blob) { + if (BROWSER_ENV) { + // Running in environment with canvas + let img = await loadImageFunction(blob); + + const ctx = createCanvasFunction(img.width, img.height).getContext("2d"); + + // Draw image to context + ctx.drawImage(img, 0, 0); + + return new this( + ctx.getImageData(0, 0, img.width, img.height).data, + img.width, + img.height, + 4, + ); + } else { + // Use sharp.js to read (and possible resize) the image. + let img = sharp(await blob.arrayBuffer()); + + return await loadImageFunction(img); + } + } + + /** + * Helper method to create a new Image from a tensor + * @param {import('./tensor.js').Tensor} tensor + */ + static fromTensor(tensor, channel_format = "CHW") { + if (tensor.dims.length !== 3) { + throw new Error( + `Tensor should have 3 dimensions, but has ${tensor.dims.length} dimensions.`, + ); } - /** - * Returns the size of the image (width, height). - * @returns {[number, number]} The size of the image (width, height). 
- */ - get size() { - return [this.width, this.height]; + if (channel_format === "CHW") { + tensor = tensor.transpose(1, 2, 0); + } else if (channel_format === "HWC") { + // Do nothing + } else { + throw new Error(`Unsupported channel format: ${channel_format}`); + } + if ( + !( + tensor.data instanceof Uint8ClampedArray || + tensor.data instanceof Uint8Array + ) + ) { + throw new Error(`Unsupported tensor type: ${tensor.type}`); + } + switch (tensor.dims[2]) { + case 1: + case 2: + case 3: + case 4: + return new RawImage( + tensor.data, + tensor.dims[1], + tensor.dims[0], + tensor.dims[2], + ); + default: + throw new Error(`Unsupported number of channels: ${tensor.dims[2]}`); + } + } + + /** + * Convert the image to grayscale format. + * @returns {RawImage} `this` to support chaining. + */ + grayscale() { + if (this.channels === 1) { + return this; } - /** - * Helper method for reading an image from a variety of input types. - * @param {RawImage|string|URL} input - * @returns The image object. - * - * **Example:** Read image from a URL. - * ```javascript - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * // RawImage { - * // "data": Uint8ClampedArray [ 25, 25, 25, 19, 19, 19, ... 
], - * // "width": 800, - * // "height": 533, - * // "channels": 3 - * // } - * ``` - */ - static async read(input) { - if (input instanceof RawImage) { - return input; - } else if (typeof input === 'string' || input instanceof URL) { - return await this.fromURL(input); - } else { - throw new Error(`Unsupported input type: ${typeof input}`); + let newData = new Uint8ClampedArray(this.width * this.height * 1); + switch (this.channels) { + case 3: // rgb to grayscale + case 4: // rgba to grayscale + for (let i = 0, offset = 0; i < this.data.length; i += this.channels) { + const red = this.data[i]; + const green = this.data[i + 1]; + const blue = this.data[i + 2]; + + newData[offset++] = Math.round( + 0.2989 * red + 0.587 * green + 0.114 * blue, + ); } + break; + default: + throw new Error( + `Conversion failed due to unsupported number of channels: ${this.channels}`, + ); + } + return this._update(newData, this.width, this.height, 1); + } + + /** + * Convert the image to RGB format. + * @returns {RawImage} `this` to support chaining. + */ + rgb() { + if (this.channels === 3) { + return this; } + let newData = new Uint8ClampedArray(this.width * this.height * 3); - /** - * Read an image from a URL or file path. - * @param {string|URL} url The URL or file path to read the image from. - * @returns {Promise} The image object. 
- */ - static async fromURL(url) { - let response = await getFile(url); - if (response.status !== 200) { - throw new Error(`Unable to read image from "${url}" (${response.status} ${response.statusText})`); + switch (this.channels) { + case 1: // grayscale to rgb + for (let i = 0, offset = 0; i < this.data.length; ++i) { + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i]; } - let blob = await response.blob(); - return this.fromBlob(blob); + break; + case 4: // rgba to rgb + for (let i = 0, offset = 0; i < this.data.length; i += 4) { + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i + 1]; + newData[offset++] = this.data[i + 2]; + } + break; + default: + throw new Error( + `Conversion failed due to unsupported number of channels: ${this.channels}`, + ); + } + return this._update(newData, this.width, this.height, 3); + } + + /** + * Convert the image to RGBA format. + * @returns {RawImage} `this` to support chaining. + */ + rgba() { + if (this.channels === 4) { + return this; } - /** - * Helper method to create a new Image from a blob. - * @param {Blob} blob The blob to read the image from. - * @returns {Promise} The image object. - */ - static async fromBlob(blob) { - if (BROWSER_ENV) { - // Running in environment with canvas - let img = await loadImageFunction(blob); + let newData = new Uint8ClampedArray(this.width * this.height * 4); - const ctx = createCanvasFunction(img.width, img.height).getContext('2d'); - - // Draw image to context - ctx.drawImage(img, 0, 0); - - return new this(ctx.getImageData(0, 0, img.width, img.height).data, img.width, img.height, 4); - - } else { - // Use sharp.js to read (and possible resize) the image. 
- let img = sharp(await blob.arrayBuffer()); - - return await loadImageFunction(img); + switch (this.channels) { + case 1: // grayscale to rgba + for (let i = 0, offset = 0; i < this.data.length; ++i) { + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i]; + newData[offset++] = 255; } + break; + case 3: // rgb to rgba + for (let i = 0, offset = 0; i < this.data.length; i += 3) { + newData[offset++] = this.data[i]; + newData[offset++] = this.data[i + 1]; + newData[offset++] = this.data[i + 2]; + newData[offset++] = 255; + } + break; + default: + throw new Error( + `Conversion failed due to unsupported number of channels: ${this.channels}`, + ); } - /** - * Helper method to create a new Image from a tensor - * @param {import('./tensor.js').Tensor} tensor - */ - static fromTensor(tensor, channel_format = 'CHW') { - if (tensor.dims.length !== 3) { - throw new Error(`Tensor should have 3 dimensions, but has ${tensor.dims.length} dimensions.`); - } + return this._update(newData, this.width, this.height, 4); + } - if (channel_format === 'CHW') { - tensor = tensor.transpose(1, 2, 0); - } else if (channel_format === 'HWC') { - // Do nothing - } else { - throw new Error(`Unsupported channel format: ${channel_format}`); - } - if (!(tensor.data instanceof Uint8ClampedArray || tensor.data instanceof Uint8Array)) { - throw new Error(`Unsupported tensor type: ${tensor.type}`); - } - switch (tensor.dims[2]) { - case 1: - case 2: - case 3: - case 4: - return new RawImage(tensor.data, tensor.dims[1], tensor.dims[0], tensor.dims[2]); - default: - throw new Error(`Unsupported number of channels: ${tensor.dims[2]}`); - } - } + /** + * Resize the image to the given dimensions. This method uses the canvas API to perform the resizing. + * @param {number} width The width of the new image. + * @param {number} height The height of the new image. + * @param {Object} options Additional options for resizing. 
+ * @param {0|1|2|3|4|5|string} [options.resample] The resampling method to use. + * @returns {Promise} `this` to support chaining. + */ + async resize(width, height, { resample = 2 } = {}) { + // Ensure resample method is a string + let resampleMethod = RESAMPLING_MAPPING[resample] ?? resample; - /** - * Convert the image to grayscale format. - * @returns {RawImage} `this` to support chaining. - */ - grayscale() { - if (this.channels === 1) { - return this; - } + if (BROWSER_ENV) { + // TODO use `resample` in browser environment - let newData = new Uint8ClampedArray(this.width * this.height * 1); - switch (this.channels) { - case 3: // rgb to grayscale - case 4: // rgba to grayscale - for (let i = 0, offset = 0; i < this.data.length; i += this.channels) { - const red = this.data[i]; - const green = this.data[i + 1]; - const blue = this.data[i + 2]; + // Store number of channels before resizing + let numChannels = this.channels; - newData[offset++] = Math.round(0.2989 * red + 0.5870 * green + 0.1140 * blue); - } - break; - default: - throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`); - } - return this._update(newData, this.width, this.height, 1); - } + // Create canvas object for this image + let canvas = this.toCanvas(); - /** - * Convert the image to RGB format. - * @returns {RawImage} `this` to support chaining. 
- */ - rgb() { - if (this.channels === 3) { - return this; - } + // Actually perform resizing using the canvas API + const ctx = createCanvasFunction(width, height).getContext("2d"); - let newData = new Uint8ClampedArray(this.width * this.height * 3); + // Draw image to context, resizing in the process + ctx.drawImage(canvas, 0, 0, width, height); - switch (this.channels) { - case 1: // grayscale to rgb - for (let i = 0, offset = 0; i < this.data.length; ++i) { - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i]; - } - break; - case 4: // rgba to rgb - for (let i = 0, offset = 0; i < this.data.length; i += 4) { - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i + 1]; - newData[offset++] = this.data[i + 2]; - } - break; - default: - throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`); - } - return this._update(newData, this.width, this.height, 3); + // Create image from the resized data + let resizedImage = new RawImage( + ctx.getImageData(0, 0, width, height).data, + width, + height, + 4, + ); - } + // Convert back so that image has the same number of channels as before + return resizedImage.convert(numChannels); + } else { + // Create sharp image from raw data, and resize + let img = this.toSharp(); - /** - * Convert the image to RGBA format. - * @returns {RawImage} `this` to support chaining. 
- */ - rgba() { - if (this.channels === 4) { - return this; - } - - let newData = new Uint8ClampedArray(this.width * this.height * 4); - - switch (this.channels) { - case 1: // grayscale to rgba - for (let i = 0, offset = 0; i < this.data.length; ++i) { - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i]; - newData[offset++] = 255; - } - break; - case 3: // rgb to rgba - for (let i = 0, offset = 0; i < this.data.length; i += 3) { - newData[offset++] = this.data[i]; - newData[offset++] = this.data[i + 1]; - newData[offset++] = this.data[i + 2]; - newData[offset++] = 255; - } - break; - default: - throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`); - } - - return this._update(newData, this.width, this.height, 4); - } - - /** - * Resize the image to the given dimensions. This method uses the canvas API to perform the resizing. - * @param {number} width The width of the new image. - * @param {number} height The height of the new image. - * @param {Object} options Additional options for resizing. - * @param {0|1|2|3|4|5|string} [options.resample] The resampling method to use. - * @returns {Promise} `this` to support chaining. - */ - async resize(width, height, { - resample = 2, - } = {}) { - - // Ensure resample method is a string - let resampleMethod = RESAMPLING_MAPPING[resample] ?? 
resample; - - if (BROWSER_ENV) { - // TODO use `resample` in browser environment - - // Store number of channels before resizing - let numChannels = this.channels; - - // Create canvas object for this image - let canvas = this.toCanvas(); - - // Actually perform resizing using the canvas API - const ctx = createCanvasFunction(width, height).getContext('2d'); - - // Draw image to context, resizing in the process - ctx.drawImage(canvas, 0, 0, width, height); - - // Create image from the resized data - let resizedImage = new RawImage(ctx.getImageData(0, 0, width, height).data, width, height, 4); - - // Convert back so that image has the same number of channels as before - return resizedImage.convert(numChannels); - - } else { - // Create sharp image from raw data, and resize - let img = this.toSharp(); - - switch (resampleMethod) { - case 'box': - case 'hamming': - if (resampleMethod === 'box' || resampleMethod === 'hamming') { - console.warn(`Resampling method ${resampleMethod} is not yet supported. Using bilinear instead.`); - resampleMethod = 'bilinear'; - } - - case 'nearest': - case 'bilinear': - case 'bicubic': - // Perform resizing using affine transform. - // This matches how the python Pillow library does it. 
- img = img.affine([width / this.width, 0, 0, height / this.height], { - interpolator: resampleMethod - }); - break; - - case 'lanczos': - // https://github.com/python-pillow/Pillow/discussions/5519 - // https://github.com/lovell/sharp/blob/main/docs/api-resize.md - img = img.resize({ - width, height, - fit: 'fill', - kernel: 'lanczos3', // PIL Lanczos uses a kernel size of 3 - }); - break; - - default: - throw new Error(`Resampling method ${resampleMethod} is not supported.`); - } - - return await loadImageFunction(img); - } - - } - - async pad([left, right, top, bottom]) { - left = Math.max(left, 0); - right = Math.max(right, 0); - top = Math.max(top, 0); - bottom = Math.max(bottom, 0); - - if (left === 0 && right === 0 && top === 0 && bottom === 0) { - // No padding needed - return this; - } - - if (BROWSER_ENV) { - // Store number of channels before padding - let numChannels = this.channels; - - // Create canvas object for this image - let canvas = this.toCanvas(); - - let newWidth = this.width + left + right; - let newHeight = this.height + top + bottom; - - // Create a new canvas of the desired size. - const ctx = createCanvasFunction(newWidth, newHeight).getContext('2d'); - - // Draw image to context, padding in the process - ctx.drawImage(canvas, - 0, 0, this.width, this.height, - left, top, newWidth, newHeight + switch (resampleMethod) { + case "box": + case "hamming": + if (resampleMethod === "box" || resampleMethod === "hamming") { + console.warn( + `Resampling method ${resampleMethod} is not yet supported. Using bilinear instead.`, ); + resampleMethod = "bilinear"; + } - // Create image from the padded data - let paddedImage = new RawImage( - ctx.getImageData(0, 0, newWidth, newHeight).data, - newWidth, newHeight, 4); + case "nearest": + case "bilinear": + case "bicubic": + // Perform resizing using affine transform. + // This matches how the python Pillow library does it. 
+ img = img.affine([width / this.width, 0, 0, height / this.height], { + interpolator: resampleMethod, + }); + break; - // Convert back so that image has the same number of channels as before - return paddedImage.convert(numChannels); + case "lanczos": + // https://github.com/python-pillow/Pillow/discussions/5519 + // https://github.com/lovell/sharp/blob/main/docs/api-resize.md + img = img.resize({ + width, + height, + fit: "fill", + kernel: "lanczos3", // PIL Lanczos uses a kernel size of 3 + }); + break; - } else { - let img = this.toSharp().extend({ left, right, top, bottom }); - return await loadImageFunction(img); - } + default: + throw new Error( + `Resampling method ${resampleMethod} is not supported.`, + ); + } + + return await loadImageFunction(img); + } + } + + async pad([left, right, top, bottom]) { + left = Math.max(left, 0); + right = Math.max(right, 0); + top = Math.max(top, 0); + bottom = Math.max(bottom, 0); + + if (left === 0 && right === 0 && top === 0 && bottom === 0) { + // No padding needed + return this; } - async crop([x_min, y_min, x_max, y_max]) { - // Ensure crop bounds are within the image - x_min = Math.max(x_min, 0); - y_min = Math.max(y_min, 0); - x_max = Math.min(x_max, this.width - 1); - y_max = Math.min(y_max, this.height - 1); + if (BROWSER_ENV) { + // Store number of channels before padding + let numChannels = this.channels; - // Do nothing if the crop is the entire image - if (x_min === 0 && y_min === 0 && x_max === this.width - 1 && y_max === this.height - 1) { - return this; - } + // Create canvas object for this image + let canvas = this.toCanvas(); - const crop_width = x_max - x_min + 1; - const crop_height = y_max - y_min + 1; + let newWidth = this.width + left + right; + let newHeight = this.height + top + bottom; - if (BROWSER_ENV) { - // Store number of channels before resizing - const numChannels = this.channels; + // Create a new canvas of the desired size. 
+ const ctx = createCanvasFunction(newWidth, newHeight).getContext("2d"); - // Create canvas object for this image - const canvas = this.toCanvas(); + // Draw image to context, padding in the process + ctx.drawImage( + canvas, + 0, + 0, + this.width, + this.height, + left, + top, + newWidth, + newHeight, + ); - // Create a new canvas of the desired size. This is needed since if the - // image is too small, we need to pad it with black pixels. - const ctx = createCanvasFunction(crop_width, crop_height).getContext('2d'); + // Create image from the padded data + let paddedImage = new RawImage( + ctx.getImageData(0, 0, newWidth, newHeight).data, + newWidth, + newHeight, + 4, + ); - // Draw image to context, cropping in the process - ctx.drawImage(canvas, - x_min, y_min, crop_width, crop_height, - 0, 0, crop_width, crop_height - ); + // Convert back so that image has the same number of channels as before + return paddedImage.convert(numChannels); + } else { + let img = this.toSharp().extend({ left, right, top, bottom }); + return await loadImageFunction(img); + } + } - // Create image from the resized data - const resizedImage = new RawImage(ctx.getImageData(0, 0, crop_width, crop_height).data, crop_width, crop_height, 4); - - // Convert back so that image has the same number of channels as before - return resizedImage.convert(numChannels); - - } else { - // Create sharp image from raw data - const img = this.toSharp().extract({ - left: x_min, - top: y_min, - width: crop_width, - height: crop_height, - }); - - return await loadImageFunction(img); - } + async crop([x_min, y_min, x_max, y_max]) { + // Ensure crop bounds are within the image + x_min = Math.max(x_min, 0); + y_min = Math.max(y_min, 0); + x_max = Math.min(x_max, this.width - 1); + y_max = Math.min(y_max, this.height - 1); + // Do nothing if the crop is the entire image + if ( + x_min === 0 && + y_min === 0 && + x_max === this.width - 1 && + y_max === this.height - 1 + ) { + return this; } - async 
center_crop(crop_width, crop_height) { - // If the image is already the desired size, return it - if (this.width === crop_width && this.height === crop_height) { - return this; - } + const crop_width = x_max - x_min + 1; + const crop_height = y_max - y_min + 1; - // Determine bounds of the image in the new canvas - let width_offset = (this.width - crop_width) / 2; - let height_offset = (this.height - crop_height) / 2; + if (BROWSER_ENV) { + // Store number of channels before resizing + const numChannels = this.channels; + // Create canvas object for this image + const canvas = this.toCanvas(); - if (BROWSER_ENV) { - // Store number of channels before resizing - let numChannels = this.channels; + // Create a new canvas of the desired size. This is needed since if the + // image is too small, we need to pad it with black pixels. + const ctx = createCanvasFunction(crop_width, crop_height).getContext( + "2d", + ); - // Create canvas object for this image - let canvas = this.toCanvas(); + // Draw image to context, cropping in the process + ctx.drawImage( + canvas, + x_min, + y_min, + crop_width, + crop_height, + 0, + 0, + crop_width, + crop_height, + ); - // Create a new canvas of the desired size. This is needed since if the - // image is too small, we need to pad it with black pixels. 
- const ctx = createCanvasFunction(crop_width, crop_height).getContext('2d'); + // Create image from the resized data + const resizedImage = new RawImage( + ctx.getImageData(0, 0, crop_width, crop_height).data, + crop_width, + crop_height, + 4, + ); - let sourceX = 0; - let sourceY = 0; - let destX = 0; - let destY = 0; + // Convert back so that image has the same number of channels as before + return resizedImage.convert(numChannels); + } else { + // Create sharp image from raw data + const img = this.toSharp().extract({ + left: x_min, + top: y_min, + width: crop_width, + height: crop_height, + }); - if (width_offset >= 0) { - sourceX = width_offset; - } else { - destX = -width_offset; - } + return await loadImageFunction(img); + } + } - if (height_offset >= 0) { - sourceY = height_offset; - } else { - destY = -height_offset; - } - - // Draw image to context, cropping in the process - ctx.drawImage(canvas, - sourceX, sourceY, crop_width, crop_height, - destX, destY, crop_width, crop_height - ); - - // Create image from the resized data - let resizedImage = new RawImage(ctx.getImageData(0, 0, crop_width, crop_height).data, crop_width, crop_height, 4); - - // Convert back so that image has the same number of channels as before - return resizedImage.convert(numChannels); - - } else { - // Create sharp image from raw data - let img = this.toSharp(); - - if (width_offset >= 0 && height_offset >= 0) { - // Cropped image lies entirely within the original image - img = img.extract({ - left: Math.floor(width_offset), - top: Math.floor(height_offset), - width: crop_width, - height: crop_height, - }) - } else if (width_offset <= 0 && height_offset <= 0) { - // Cropped image lies entirely outside the original image, - // so we add padding - let top = Math.floor(-height_offset); - let left = Math.floor(-width_offset); - img = img.extend({ - top: top, - left: left, - - // Ensures the resulting image has the desired dimensions - right: crop_width - this.width - left, - bottom: 
crop_height - this.height - top, - }); - } else { - // Cropped image lies partially outside the original image. - // We first pad, then crop. - - let y_padding = [0, 0]; - let y_extract = 0; - if (height_offset < 0) { - y_padding[0] = Math.floor(-height_offset); - y_padding[1] = crop_height - this.height - y_padding[0]; - } else { - y_extract = Math.floor(height_offset); - } - - let x_padding = [0, 0]; - let x_extract = 0; - if (width_offset < 0) { - x_padding[0] = Math.floor(-width_offset); - x_padding[1] = crop_width - this.width - x_padding[0]; - } else { - x_extract = Math.floor(width_offset); - } - - img = img.extend({ - top: y_padding[0], - bottom: y_padding[1], - left: x_padding[0], - right: x_padding[1], - }).extract({ - left: x_extract, - top: y_extract, - width: crop_width, - height: crop_height, - }) - } - - return await loadImageFunction(img); - } + async center_crop(crop_width, crop_height) { + // If the image is already the desired size, return it + if (this.width === crop_width && this.height === crop_height) { + return this; } - async toBlob(type = 'image/png', quality = 1) { - if (!BROWSER_ENV) { - throw new Error('toBlob() is only supported in browser environments.') - } + // Determine bounds of the image in the new canvas + let width_offset = (this.width - crop_width) / 2; + let height_offset = (this.height - crop_height) / 2; - const canvas = this.toCanvas(); - return await canvas.convertToBlob({ type, quality }); - } + if (BROWSER_ENV) { + // Store number of channels before resizing + let numChannels = this.channels; - toCanvas() { - if (!BROWSER_ENV) { - throw new Error('toCanvas() is only supported in browser environments.') - } + // Create canvas object for this image + let canvas = this.toCanvas(); - // Clone, and convert data to RGBA before drawing to canvas. - // This is because the canvas API only supports RGBA - let cloned = this.clone().rgba(); + // Create a new canvas of the desired size. 
This is needed since if the + // image is too small, we need to pad it with black pixels. + const ctx = createCanvasFunction(crop_width, crop_height).getContext( + "2d", + ); - // Create canvas object for the cloned image - let clonedCanvas = createCanvasFunction(cloned.width, cloned.height); + let sourceX = 0; + let sourceY = 0; + let destX = 0; + let destY = 0; - // Draw image to context - let data = new ImageDataClass(cloned.data, cloned.width, cloned.height); - clonedCanvas.getContext('2d').putImageData(data, 0, 0); + if (width_offset >= 0) { + sourceX = width_offset; + } else { + destX = -width_offset; + } - return clonedCanvas; - } + if (height_offset >= 0) { + sourceY = height_offset; + } else { + destY = -height_offset; + } - /** - * Helper method to update the image data. - * @param {Uint8ClampedArray} data The new image data. - * @param {number} width The new width of the image. - * @param {number} height The new height of the image. - * @param {1|2|3|4|null} [channels] The new number of channels of the image. - * @private - */ - _update(data, width, height, channels = null) { - this.data = data; - this.width = width; - this.height = height; - if (channels !== null) { - this.channels = channels; - } - return this; - } + // Draw image to context, cropping in the process + ctx.drawImage( + canvas, + sourceX, + sourceY, + crop_width, + crop_height, + destX, + destY, + crop_width, + crop_height, + ); - /** - * Clone the image - * @returns {RawImage} The cloned image - */ - clone() { - return new RawImage(this.data.slice(), this.width, this.height, this.channels); - } + // Create image from the resized data + let resizedImage = new RawImage( + ctx.getImageData(0, 0, crop_width, crop_height).data, + crop_width, + crop_height, + 4, + ); - /** - * Helper method for converting image to have a certain number of channels - * @param {number} numChannels The number of channels. Must be 1, 3, or 4. - * @returns {RawImage} `this` to support chaining. 
- */ - convert(numChannels) { - if (this.channels === numChannels) return this; // Already correct number of channels + // Convert back so that image has the same number of channels as before + return resizedImage.convert(numChannels); + } else { + // Create sharp image from raw data + let img = this.toSharp(); - switch (numChannels) { - case 1: - this.grayscale(); - break; - case 3: - this.rgb(); - break; - case 4: - this.rgba(); - break; - default: - throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`); - } - return this; - } - - /** - * Save the image to the given path. - * @param {string} path The path to save the image to. - */ - async save(path) { - - if (BROWSER_ENV) { - if (WEBWORKER_ENV) { - throw new Error('Unable to save an image from a Web Worker.') - } - - const extension = path.split('.').pop().toLowerCase(); - const mime = CONTENT_TYPE_MAP.get(extension) ?? 'image/png'; - - // Convert image to Blob - const blob = await this.toBlob(mime); - - // Convert the canvas content to a data URL - const dataURL = URL.createObjectURL(blob); - - // Create an anchor element with the data URL as the href attribute - const downloadLink = document.createElement('a'); - downloadLink.href = dataURL; - - // Set the download attribute to specify the desired filename for the downloaded image - downloadLink.download = path; - - // Trigger the download - downloadLink.click(); - - // Clean up: remove the anchor element from the DOM - downloadLink.remove(); - - } else if (!env.useFS) { - throw new Error('Unable to save the image because filesystem is disabled in this environment.') - - } else { - const img = this.toSharp(); - return await img.toFile(path); - } - } - - toSharp() { - if (BROWSER_ENV) { - throw new Error('toSharp() is only supported in server-side environments.') - } - - return sharp(this.data, { - raw: { - width: this.width, - height: this.height, - channels: this.channels - } + if (width_offset >= 0 && height_offset >= 
0) { + // Cropped image lies entirely within the original image + img = img.extract({ + left: Math.floor(width_offset), + top: Math.floor(height_offset), + width: crop_width, + height: crop_height, }); + } else if (width_offset <= 0 && height_offset <= 0) { + // Cropped image lies entirely outside the original image, + // so we add padding + let top = Math.floor(-height_offset); + let left = Math.floor(-width_offset); + img = img.extend({ + top: top, + left: left, + + // Ensures the resulting image has the desired dimensions + right: crop_width - this.width - left, + bottom: crop_height - this.height - top, + }); + } else { + // Cropped image lies partially outside the original image. + // We first pad, then crop. + + let y_padding = [0, 0]; + let y_extract = 0; + if (height_offset < 0) { + y_padding[0] = Math.floor(-height_offset); + y_padding[1] = crop_height - this.height - y_padding[0]; + } else { + y_extract = Math.floor(height_offset); + } + + let x_padding = [0, 0]; + let x_extract = 0; + if (width_offset < 0) { + x_padding[0] = Math.floor(-width_offset); + x_padding[1] = crop_width - this.width - x_padding[0]; + } else { + x_extract = Math.floor(width_offset); + } + + img = img + .extend({ + top: y_padding[0], + bottom: y_padding[1], + left: x_padding[0], + right: x_padding[1], + }) + .extract({ + left: x_extract, + top: y_extract, + width: crop_width, + height: crop_height, + }); + } + + return await loadImageFunction(img); } + } + + async toBlob(type = "image/png", quality = 1) { + if (!BROWSER_ENV) { + throw new Error("toBlob() is only supported in browser environments."); + } + + const canvas = this.toCanvas(); + return await canvas.convertToBlob({ type, quality }); + } + + toCanvas() { + if (!BROWSER_ENV) { + throw new Error("toCanvas() is only supported in browser environments."); + } + + // Clone, and convert data to RGBA before drawing to canvas. 
+ // This is because the canvas API only supports RGBA + let cloned = this.clone().rgba(); + + // Create canvas object for the cloned image + let clonedCanvas = createCanvasFunction(cloned.width, cloned.height); + + // Draw image to context + let data = new ImageDataClass(cloned.data, cloned.width, cloned.height); + clonedCanvas.getContext("2d").putImageData(data, 0, 0); + + return clonedCanvas; + } + + /** + * Helper method to update the image data. + * @param {Uint8ClampedArray} data The new image data. + * @param {number} width The new width of the image. + * @param {number} height The new height of the image. + * @param {1|2|3|4|null} [channels] The new number of channels of the image. + * @private + */ + _update(data, width, height, channels = null) { + this.data = data; + this.width = width; + this.height = height; + if (channels !== null) { + this.channels = channels; + } + return this; + } + + /** + * Clone the image + * @returns {RawImage} The cloned image + */ + clone() { + return new RawImage( + this.data.slice(), + this.width, + this.height, + this.channels, + ); + } + + /** + * Helper method for converting image to have a certain number of channels + * @param {number} numChannels The number of channels. Must be 1, 3, or 4. + * @returns {RawImage} `this` to support chaining. + */ + convert(numChannels) { + if (this.channels === numChannels) return this; // Already correct number of channels + + switch (numChannels) { + case 1: + this.grayscale(); + break; + case 3: + this.rgb(); + break; + case 4: + this.rgba(); + break; + default: + throw new Error( + `Conversion failed due to unsupported number of channels: ${this.channels}`, + ); + } + return this; + } + + /** + * Save the image to the given path. + * @param {string} path The path to save the image to. 
+ */ + async save(path) { + if (BROWSER_ENV) { + if (WEBWORKER_ENV) { + throw new Error("Unable to save an image from a Web Worker."); + } + + const extension = path.split(".").pop().toLowerCase(); + const mime = CONTENT_TYPE_MAP.get(extension) ?? "image/png"; + + // Convert image to Blob + const blob = await this.toBlob(mime); + + // Convert the canvas content to a data URL + const dataURL = URL.createObjectURL(blob); + + // Create an anchor element with the data URL as the href attribute + const downloadLink = document.createElement("a"); + downloadLink.href = dataURL; + + // Set the download attribute to specify the desired filename for the downloaded image + downloadLink.download = path; + + // Trigger the download + downloadLink.click(); + + // Clean up: remove the anchor element from the DOM + downloadLink.remove(); + } else if (!env.useFS) { + throw new Error( + "Unable to save the image because filesystem is disabled in this environment.", + ); + } else { + const img = this.toSharp(); + return await img.toFile(path); + } + } + + toSharp() { + if (BROWSER_ENV) { + throw new Error( + "toSharp() is only supported in server-side environments.", + ); + } + + return sharp(this.data, { + raw: { + width: this.width, + height: this.height, + channels: this.channels, + }, + }); + } } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/maths.js b/core/vendor/modules/@xenova/transformers/src/utils/maths.js index 216def07e..a3b523bcf 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/maths.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/maths.js @@ -1,10 +1,9 @@ - /** - * @file Helper module for mathematical processing. - * - * These functions and classes are only used internally, + * @file Helper module for mathematical processing. + * + * These functions and classes are only used internally, * meaning an end-user shouldn't need to access anything here. 
- * + * * @module utils/maths */ @@ -17,117 +16,120 @@ /** * @param {TypedArray} input */ -export function interpolate_data(input, [in_channels, in_height, in_width], [out_height, out_width], mode = 'bilinear', align_corners = false) { - // TODO use mode and align_corners +export function interpolate_data( + input, + [in_channels, in_height, in_width], + [out_height, out_width], + mode = "bilinear", + align_corners = false, +) { + // TODO use mode and align_corners - // Output image dimensions - const x_scale = out_width / in_width; - const y_scale = out_height / in_height; + // Output image dimensions + const x_scale = out_width / in_width; + const y_scale = out_height / in_height; - // Output image - // @ts-ignore - const out_img = new input.constructor(out_height * out_width * in_channels); + // Output image + // @ts-ignore + const out_img = new input.constructor(out_height * out_width * in_channels); - // Pre-calculate strides - const inStride = in_height * in_width; - const outStride = out_height * out_width; + // Pre-calculate strides + const inStride = in_height * in_width; + const outStride = out_height * out_width; - for (let i = 0; i < out_height; ++i) { - for (let j = 0; j < out_width; ++j) { - // Calculate output offset - const outOffset = i * out_width + j; + for (let i = 0; i < out_height; ++i) { + for (let j = 0; j < out_width; ++j) { + // Calculate output offset + const outOffset = i * out_width + j; - // Calculate input pixel coordinates - const x = (j + 0.5) / x_scale - 0.5; - const y = (i + 0.5) / y_scale - 0.5; + // Calculate input pixel coordinates + const x = (j + 0.5) / x_scale - 0.5; + const y = (i + 0.5) / y_scale - 0.5; - // Calculate the four nearest input pixels - // We also check if the input pixel coordinates are within the image bounds - let x1 = Math.floor(x); - let y1 = Math.floor(y); - const x2 = Math.min(x1 + 1, in_width - 1); - const y2 = Math.min(y1 + 1, in_height - 1); + // Calculate the four nearest input pixels + // We also 
check if the input pixel coordinates are within the image bounds + let x1 = Math.floor(x); + let y1 = Math.floor(y); + const x2 = Math.min(x1 + 1, in_width - 1); + const y2 = Math.min(y1 + 1, in_height - 1); - x1 = Math.max(x1, 0); - y1 = Math.max(y1, 0); + x1 = Math.max(x1, 0); + y1 = Math.max(y1, 0); + // Calculate the fractional distances between the input pixel and the four nearest pixels + const s = x - x1; + const t = y - y1; - // Calculate the fractional distances between the input pixel and the four nearest pixels - const s = x - x1; - const t = y - y1; + // Perform bilinear interpolation + const w1 = (1 - s) * (1 - t); + const w2 = s * (1 - t); + const w3 = (1 - s) * t; + const w4 = s * t; - // Perform bilinear interpolation - const w1 = (1 - s) * (1 - t); - const w2 = s * (1 - t); - const w3 = (1 - s) * t; - const w4 = s * t; + // Calculate the four nearest input pixel indices + const yStride = y1 * in_width; + const xStride = y2 * in_width; + const idx1 = yStride + x1; + const idx2 = yStride + x2; + const idx3 = xStride + x1; + const idx4 = xStride + x2; - // Calculate the four nearest input pixel indices - const yStride = y1 * in_width; - const xStride = y2 * in_width; - const idx1 = yStride + x1; - const idx2 = yStride + x2; - const idx3 = xStride + x1; - const idx4 = xStride + x2; + for (let k = 0; k < in_channels; ++k) { + // Calculate channel offset + const cOffset = k * inStride; - for (let k = 0; k < in_channels; ++k) { - // Calculate channel offset - const cOffset = k * inStride; - - out_img[k * outStride + outOffset] = - w1 * input[cOffset + idx1] + - w2 * input[cOffset + idx2] + - w3 * input[cOffset + idx3] + - w4 * input[cOffset + idx4]; - } - } + out_img[k * outStride + outOffset] = + w1 * input[cOffset + idx1] + + w2 * input[cOffset + idx2] + + w3 * input[cOffset + idx3] + + w4 * input[cOffset + idx4]; + } } + } - return out_img; + return out_img; } - /** * Helper method to transpose a `AnyTypedArray` directly - * @template {AnyTypedArray} T 
- * @param {T} array - * @param {number[]} dims - * @param {number[]} axes + * @template {AnyTypedArray} T + * @param {T} array + * @param {number[]} dims + * @param {number[]} axes * @returns {[T, number[]]} The transposed array and the new shape. */ export function transpose_data(array, dims, axes) { - // Calculate the new shape of the transposed array - // and the stride of the original array - const shape = new Array(axes.length); - const stride = new Array(axes.length); + // Calculate the new shape of the transposed array + // and the stride of the original array + const shape = new Array(axes.length); + const stride = new Array(axes.length); - for (let i = axes.length - 1, s = 1; i >= 0; --i) { - stride[i] = s; - shape[i] = dims[axes[i]]; - s *= shape[i]; + for (let i = axes.length - 1, s = 1; i >= 0; --i) { + stride[i] = s; + shape[i] = dims[axes[i]]; + s *= shape[i]; + } + + // Precompute inverse mapping of stride + const invStride = axes.map((_, i) => stride[axes.indexOf(i)]); + + // Create the transposed array with the new shape + // @ts-ignore + const transposedData = new array.constructor(array.length); + + // Transpose the original array to the new array + for (let i = 0; i < array.length; ++i) { + let newIndex = 0; + for (let j = dims.length - 1, k = i; j >= 0; --j) { + newIndex += (k % dims[j]) * invStride[j]; + k = Math.floor(k / dims[j]); } + transposedData[newIndex] = array[i]; + } - // Precompute inverse mapping of stride - const invStride = axes.map((_, i) => stride[axes.indexOf(i)]); - - // Create the transposed array with the new shape - // @ts-ignore - const transposedData = new array.constructor(array.length); - - // Transpose the original array to the new array - for (let i = 0; i < array.length; ++i) { - let newIndex = 0; - for (let j = dims.length - 1, k = i; j >= 0; --j) { - newIndex += (k % dims[j]) * invStride[j]; - k = Math.floor(k / dims[j]); - } - transposedData[newIndex] = array[i]; - } - - return [transposedData, shape]; + return 
[transposedData, shape]; } - /** * Compute the softmax of an array of numbers. * @template {TypedArray|number[]} T @@ -135,20 +137,20 @@ export function transpose_data(array, dims, axes) { * @returns {T} The softmax array. */ export function softmax(arr) { - // Compute the maximum value in the array - const maxVal = max(arr)[0]; + // Compute the maximum value in the array + const maxVal = max(arr)[0]; - // Compute the exponentials of the array values - const exps = arr.map(x => Math.exp(x - maxVal)); + // Compute the exponentials of the array values + const exps = arr.map((x) => Math.exp(x - maxVal)); - // Compute the sum of the exponentials - // @ts-ignore - const sumExps = exps.reduce((acc, val) => acc + val, 0); + // Compute the sum of the exponentials + // @ts-ignore + const sumExps = exps.reduce((acc, val) => acc + val, 0); - // Compute the softmax values - const softmaxArr = exps.map(x => x / sumExps); + // Compute the softmax values + const softmaxArr = exps.map((x) => x / sumExps); - return /** @type {T} */(softmaxArr); + return /** @type {T} */ (softmaxArr); } /** @@ -158,13 +160,13 @@ export function softmax(arr) { * @returns {T} The resulting log_softmax array. */ export function log_softmax(arr) { - // Compute the softmax values - const softmaxArr = softmax(arr); + // Compute the softmax values + const softmaxArr = softmax(arr); - // Apply log formula to each element - const logSoftmaxArr = softmaxArr.map(x => Math.log(x)); + // Apply log formula to each element + const logSoftmaxArr = softmaxArr.map((x) => Math.log(x)); - return /** @type {T} */(logSoftmaxArr); + return /** @type {T} */ (logSoftmaxArr); } /** @@ -174,10 +176,9 @@ export function log_softmax(arr) { * @returns {number} The dot product of arr1 and arr2. 
*/ export function dot(arr1, arr2) { - return arr1.reduce((acc, val, i) => acc + val * arr2[i], 0); + return arr1.reduce((acc, val, i) => acc + val * arr2[i], 0); } - /** * Get the top k items from an iterable, sorted by descending order * @param {any[]|TypedArray} items The items to be sorted @@ -185,17 +186,17 @@ export function dot(arr1, arr2) { * @returns {[number, any][]} The top k items, sorted by descending order */ export function getTopItems(items, top_k = 0) { - // if top == 0, return all + // if top == 0, return all - items = Array.from(items) - .map((x, i) => [i, x]) // Get indices ([index, score]) - .sort((a, b) => b[1] - a[1]) // Sort by log probabilities + items = Array.from(items) + .map((x, i) => [i, x]) // Get indices ([index, score]) + .sort((a, b) => b[1] - a[1]); // Sort by log probabilities - if (top_k !== null && top_k > 0) { - items = items.slice(0, top_k); // Get top k items - } + if (top_k !== null && top_k > 0) { + items = items.slice(0, top_k); // Get top k items + } - return items + return items; } /** @@ -206,19 +207,19 @@ export function getTopItems(items, top_k = 0) { * @returns {number} The cosine similarity between the two arrays. 
*/ export function cos_sim(arr1, arr2) { - // Calculate dot product of the two arrays - const dotProduct = dot(arr1, arr2); + // Calculate dot product of the two arrays + const dotProduct = dot(arr1, arr2); - // Calculate the magnitude of the first array - const magnitudeA = magnitude(arr1); + // Calculate the magnitude of the first array + const magnitudeA = magnitude(arr1); - // Calculate the magnitude of the second array - const magnitudeB = magnitude(arr2); + // Calculate the magnitude of the second array + const magnitudeB = magnitude(arr2); - // Calculate the cosine similarity - const cosineSimilarity = dotProduct / (magnitudeA * magnitudeB); + // Calculate the cosine similarity + const cosineSimilarity = dotProduct / (magnitudeA * magnitudeB); - return cosineSimilarity; + return cosineSimilarity; } /** @@ -227,10 +228,9 @@ export function cos_sim(arr1, arr2) { * @returns {number} The magnitude of the array. */ export function magnitude(arr) { - return Math.sqrt(arr.reduce((acc, val) => acc + val * val, 0)); + return Math.sqrt(arr.reduce((acc, val) => acc + val * val, 0)); } - /** * Returns the value and index of the minimum element in an array. * @param {number[]|TypedArray} arr array of numbers. @@ -238,19 +238,18 @@ export function magnitude(arr) { * @throws {Error} If array is empty. */ export function min(arr) { - if (arr.length === 0) throw Error('Array must not be empty'); - let min = arr[0]; - let indexOfMin = 0; - for (let i = 1; i < arr.length; ++i) { - if (arr[i] < min) { - min = arr[i]; - indexOfMin = i; - } + if (arr.length === 0) throw Error("Array must not be empty"); + let min = arr[0]; + let indexOfMin = 0; + for (let i = 1; i < arr.length; ++i) { + if (arr[i] < min) { + min = arr[i]; + indexOfMin = i; } - return [min, indexOfMin]; + } + return [min, indexOfMin]; } - /** * Returns the value and index of the maximum element in an array. * @param {number[]|AnyTypedArray} arr array of numbers. 
@@ -258,688 +257,689 @@ export function min(arr) { * @throws {Error} If array is empty. */ export function max(arr) { - if (arr.length === 0) throw Error('Array must not be empty'); - let max = arr[0]; - let indexOfMax = 0; - for (let i = 1; i < arr.length; ++i) { - if (arr[i] > max) { - max = arr[i]; - indexOfMax = i; - } + if (arr.length === 0) throw Error("Array must not be empty"); + let max = arr[0]; + let indexOfMax = 0; + for (let i = 1; i < arr.length; ++i) { + if (arr[i] > max) { + max = arr[i]; + indexOfMax = i; } - return [Number(max), indexOfMax]; + } + return [Number(max), indexOfMax]; } function isPowerOfTwo(number) { - // Check if the number is greater than 0 and has only one bit set to 1 - return (number > 0) && ((number & (number - 1)) === 0); + // Check if the number is greater than 0 and has only one bit set to 1 + return number > 0 && (number & (number - 1)) === 0; } /** * Implementation of Radix-4 FFT. - * + * * P2FFT class provides functionality for performing Fast Fourier Transform on arrays * which are a power of two in length. * Code adapted from https://www.npmjs.com/package/fft.js */ class P2FFT { - /** - * @param {number} size The size of the input array. Must be a power of two larger than 1. - * @throws {Error} FFT size must be a power of two larger than 1. - */ - constructor(size) { - this.size = size | 0; // convert to a 32-bit signed integer - if (this.size <= 1 || !isPowerOfTwo(this.size)) - throw new Error('FFT size must be a power of two larger than 1'); + /** + * @param {number} size The size of the input array. Must be a power of two larger than 1. + * @throws {Error} FFT size must be a power of two larger than 1. 
+ */ + constructor(size) { + this.size = size | 0; // convert to a 32-bit signed integer + if (this.size <= 1 || !isPowerOfTwo(this.size)) + throw new Error("FFT size must be a power of two larger than 1"); - this._csize = size << 1; + this._csize = size << 1; - this.table = new Float64Array(this.size * 2); - for (let i = 0; i < this.table.length; i += 2) { - const angle = Math.PI * i / this.size; - this.table[i] = Math.cos(angle); - this.table[i + 1] = -Math.sin(angle); + this.table = new Float64Array(this.size * 2); + for (let i = 0; i < this.table.length; i += 2) { + const angle = (Math.PI * i) / this.size; + this.table[i] = Math.cos(angle); + this.table[i + 1] = -Math.sin(angle); + } + + // Find size's power of two + let power = 0; + for (let t = 1; this.size > t; t <<= 1) ++power; + + // Calculate initial step's width: + // * If we are full radix-4, it is 2x smaller to give inital len=8 + // * Otherwise it is the same as `power` to give len=4 + this._width = power % 2 === 0 ? power - 1 : power; + + // Pre-compute bit-reversal patterns + this._bitrev = new Int32Array(1 << this._width); + for (let j = 0; j < this._bitrev.length; ++j) { + this._bitrev[j] = 0; + for (let shift = 0; shift < this._width; shift += 2) { + const revShift = this._width - shift - 2; + this._bitrev[j] |= ((j >>> shift) & 3) << revShift; + } + } + } + + /** + * Create a complex number array with size `2 * size` + * + * @returns {Float64Array} A complex number array with size `2 * size` + */ + createComplexArray() { + return new Float64Array(this._csize); + } + + /** + * Converts a complex number representation stored in a Float64Array to an array of real numbers. + * + * @param {Float64Array} complex The complex number representation to be converted. + * @param {number[]} [storage] An optional array to store the result in. + * @returns {number[]} An array of real numbers representing the input complex number representation. 
+ */ + fromComplexArray(complex, storage) { + const res = storage || new Array(complex.length >>> 1); + for (let i = 0; i < complex.length; i += 2) res[i >>> 1] = complex[i]; + return res; + } + + /** + * Convert a real-valued input array to a complex-valued output array. + * @param {Float64Array} input The real-valued input array. + * @param {Float64Array} [storage] Optional buffer to store the output array. + * @returns {Float64Array} The complex-valued output array. + */ + toComplexArray(input, storage) { + const res = storage || this.createComplexArray(); + for (let i = 0; i < res.length; i += 2) { + res[i] = input[i >>> 1]; + res[i + 1] = 0; + } + return res; + } + + /** + * Completes the spectrum by adding its mirrored negative frequency components. + * @param {Float64Array} spectrum The input spectrum. + * @returns {void} + */ + completeSpectrum(spectrum) { + const size = this._csize; + const half = size >>> 1; + for (let i = 2; i < half; i += 2) { + spectrum[size - i] = spectrum[i]; + spectrum[size - i + 1] = -spectrum[i + 1]; + } + } + + /** + * Performs a Fast Fourier Transform (FFT) on the given input data and stores the result in the output buffer. + * + * @param {Float64Array} out The output buffer to store the result. + * @param {Float64Array} data The input data to transform. + * + * @throws {Error} Input and output buffers must be different. + * + * @returns {void} + */ + transform(out, data) { + if (out === data) + throw new Error("Input and output buffers must be different"); + + this._transform4(out, data, 1 /* DONE */); + } + + /** + * Performs a real-valued forward FFT on the given input buffer and stores the result in the given output buffer. + * The input buffer must contain real values only, while the output buffer will contain complex values. The input and + * output buffers must be different. + * + * @param {Float64Array} out The output buffer. + * @param {Float64Array} data The input buffer containing real values. 
+ * + * @throws {Error} If the input and output buffers are the same. + */ + realTransform(out, data) { + if (out === data) + throw new Error("Input and output buffers must be different"); + + this._realTransform4(out, data, 1 /* DONE */); + } + + /** + * Performs an inverse FFT transformation on the given `data` array, and stores the result in `out`. + * The `out` array must be a different buffer than the `data` array. The `out` array will contain the + * result of the transformation. The `data` array will not be modified. + * + * @param {Float64Array} out The output buffer for the transformed data. + * @param {Float64Array} data The input data to transform. + * @throws {Error} If `out` and `data` refer to the same buffer. + * @returns {void} + */ + inverseTransform(out, data) { + if (out === data) + throw new Error("Input and output buffers must be different"); + + this._transform4(out, data, -1 /* DONE */); + for (let i = 0; i < out.length; ++i) out[i] /= this.size; + } + + /** + * Performs a radix-4 implementation of a discrete Fourier transform on a given set of data. + * + * @param {Float64Array} out The output buffer for the transformed data. + * @param {Float64Array} data The input buffer of data to be transformed. + * @param {number} inv A scaling factor to apply to the transform. 
+ * @returns {void} + */ + _transform4(out, data, inv) { + // radix-4 implementation + + const size = this._csize; + + // Initial step (permute and transform) + const width = this._width; + let step = 1 << width; + let len = (size / step) << 1; + + let outOff; + let t; + const bitrev = this._bitrev; + if (len === 4) { + for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { + const off = bitrev[t]; + this._singleTransform2(data, out, outOff, off, step); + } + } else { + // len === 8 + for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { + const off = bitrev[t]; + this._singleTransform4(data, out, outOff, off, step, inv); + } + } + + // Loop through steps in decreasing order + for (step >>= 2; step >= 2; step >>= 2) { + len = (size / step) << 1; + const quarterLen = len >>> 2; + + // Loop through offsets in the data + for (outOff = 0; outOff < size; outOff += len) { + // Full case + const limit = outOff + quarterLen - 1; + for (let i = outOff, k = 0; i < limit; i += 2, k += step) { + const A = i; + const B = A + quarterLen; + const C = B + quarterLen; + const D = C + quarterLen; + + // Original values + const Ar = out[A]; + const Ai = out[A + 1]; + const Br = out[B]; + const Bi = out[B + 1]; + const Cr = out[C]; + const Ci = out[C + 1]; + const Dr = out[D]; + const Di = out[D + 1]; + + const tableBr = this.table[k]; + const tableBi = inv * this.table[k + 1]; + const MBr = Br * tableBr - Bi * tableBi; + const MBi = Br * tableBi + Bi * tableBr; + + const tableCr = this.table[2 * k]; + const tableCi = inv * this.table[2 * k + 1]; + const MCr = Cr * tableCr - Ci * tableCi; + const MCi = Cr * tableCi + Ci * tableCr; + + const tableDr = this.table[3 * k]; + const tableDi = inv * this.table[3 * k + 1]; + const MDr = Dr * tableDr - Di * tableDi; + const MDi = Dr * tableDi + Di * tableDr; + + // Pre-Final values + const T0r = Ar + MCr; + const T0i = Ai + MCi; + const T1r = Ar - MCr; + const T1i = Ai - MCi; + const T2r = MBr + MDr; + const T2i = MBi + MDi; + 
const T3r = inv * (MBr - MDr); + const T3i = inv * (MBi - MDi); + + // Final values + out[A] = T0r + T2r; + out[A + 1] = T0i + T2i; + out[B] = T1r + T3i; + out[B + 1] = T1i - T3r; + out[C] = T0r - T2r; + out[C + 1] = T0i - T2i; + out[D] = T1r - T3i; + out[D + 1] = T1i + T3r; } + } + } + } - // Find size's power of two - let power = 0; - for (let t = 1; this.size > t; t <<= 1) - ++power; + /** + * Performs a radix-2 implementation of a discrete Fourier transform on a given set of data. + * + * @param {Float64Array} data The input buffer of data to be transformed. + * @param {Float64Array} out The output buffer for the transformed data. + * @param {number} outOff The offset at which to write the output data. + * @param {number} off The offset at which to begin reading the input data. + * @param {number} step The step size for indexing the input data. + * @returns {void} + */ + _singleTransform2(data, out, outOff, off, step) { + // radix-2 implementation + // NOTE: Only called for len=4 - // Calculate initial step's width: - // * If we are full radix-4, it is 2x smaller to give inital len=8 - // * Otherwise it is the same as `power` to give len=4 - this._width = power % 2 === 0 ? 
power - 1 : power; + const evenR = data[off]; + const evenI = data[off + 1]; + const oddR = data[off + step]; + const oddI = data[off + step + 1]; - // Pre-compute bit-reversal patterns - this._bitrev = new Int32Array(1 << this._width); - for (let j = 0; j < this._bitrev.length; ++j) { - this._bitrev[j] = 0; - for (let shift = 0; shift < this._width; shift += 2) { - const revShift = this._width - shift - 2; - this._bitrev[j] |= ((j >>> shift) & 3) << revShift; - } + out[outOff] = evenR + oddR; + out[outOff + 1] = evenI + oddI; + out[outOff + 2] = evenR - oddR; + out[outOff + 3] = evenI - oddI; + } + + /** + * Performs radix-4 transformation on input data of length 8 + * + * @param {Float64Array} data Input data array of length 8 + * @param {Float64Array} out Output data array of length 8 + * @param {number} outOff Index of output array to start writing from + * @param {number} off Index of input array to start reading from + * @param {number} step Step size between elements in input array + * @param {number} inv Scaling factor for inverse transform + * + * @returns {void} + */ + _singleTransform4(data, out, outOff, off, step, inv) { + // radix-4 + // NOTE: Only called for len=8 + const step2 = step * 2; + const step3 = step * 3; + + // Original values + const Ar = data[off]; + const Ai = data[off + 1]; + const Br = data[off + step]; + const Bi = data[off + step + 1]; + const Cr = data[off + step2]; + const Ci = data[off + step2 + 1]; + const Dr = data[off + step3]; + const Di = data[off + step3 + 1]; + + // Pre-Final values + const T0r = Ar + Cr; + const T0i = Ai + Ci; + const T1r = Ar - Cr; + const T1i = Ai - Ci; + const T2r = Br + Dr; + const T2i = Bi + Di; + const T3r = inv * (Br - Dr); + const T3i = inv * (Bi - Di); + + // Final values + out[outOff] = T0r + T2r; + out[outOff + 1] = T0i + T2i; + out[outOff + 2] = T1r + T3i; + out[outOff + 3] = T1i - T3r; + out[outOff + 4] = T0r - T2r; + out[outOff + 5] = T0i - T2i; + out[outOff + 6] = T1r - T3i; + out[outOff + 
7] = T1i + T3r; + } + + /** + * Real input radix-4 implementation + * @param {Float64Array} out Output array for the transformed data + * @param {Float64Array} data Input array of real data to be transformed + * @param {number} inv The scale factor used to normalize the inverse transform + */ + _realTransform4(out, data, inv) { + // Real input radix-4 implementation + const size = this._csize; + + // Initial step (permute and transform) + const width = this._width; + let step = 1 << width; + let len = (size / step) << 1; + + let outOff; + let t; + const bitrev = this._bitrev; + if (len === 4) { + for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { + const off = bitrev[t]; + this._singleRealTransform2(data, out, outOff, off >>> 1, step >>> 1); + } + } else { + // len === 8 + for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { + const off = bitrev[t]; + this._singleRealTransform4( + data, + out, + outOff, + off >>> 1, + step >>> 1, + inv, + ); + } + } + + // TODO: Optimize once https://github.com/indutny/fft.js/issues/25 is fixed + // Loop through steps in decreasing order + for (step >>= 2; step >= 2; step >>= 2) { + len = (size / step) << 1; + const quarterLen = len >>> 2; + + // Loop through offsets in the data + for (outOff = 0; outOff < size; outOff += len) { + // Full case + const limit = outOff + quarterLen - 1; + for (let i = outOff, k = 0; i < limit; i += 2, k += step) { + const A = i; + const B = A + quarterLen; + const C = B + quarterLen; + const D = C + quarterLen; + + // Original values + const Ar = out[A]; + const Ai = out[A + 1]; + const Br = out[B]; + const Bi = out[B + 1]; + const Cr = out[C]; + const Ci = out[C + 1]; + const Dr = out[D]; + const Di = out[D + 1]; + + const tableBr = this.table[k]; + const tableBi = inv * this.table[k + 1]; + const MBr = Br * tableBr - Bi * tableBi; + const MBi = Br * tableBi + Bi * tableBr; + + const tableCr = this.table[2 * k]; + const tableCi = inv * this.table[2 * k + 1]; + const MCr = Cr * 
tableCr - Ci * tableCi; + const MCi = Cr * tableCi + Ci * tableCr; + + const tableDr = this.table[3 * k]; + const tableDi = inv * this.table[3 * k + 1]; + const MDr = Dr * tableDr - Di * tableDi; + const MDi = Dr * tableDi + Di * tableDr; + + // Pre-Final values + const T0r = Ar + MCr; + const T0i = Ai + MCi; + const T1r = Ar - MCr; + const T1i = Ai - MCi; + const T2r = MBr + MDr; + const T2i = MBi + MDi; + const T3r = inv * (MBr - MDr); + const T3i = inv * (MBi - MDi); + + // Final values + out[A] = T0r + T2r; + out[A + 1] = T0i + T2i; + out[B] = T1r + T3i; + out[B + 1] = T1i - T3r; + out[C] = T0r - T2r; + out[C + 1] = T0i - T2i; + out[D] = T1r - T3i; + out[D + 1] = T1i + T3r; } + } } + } - /** - * Create a complex number array with size `2 * size` - * - * @returns {Float64Array} A complex number array with size `2 * size` - */ - createComplexArray() { - return new Float64Array(this._csize); - } + /** + * Performs a single real input radix-2 transformation on the provided data + * + * @param {Float64Array} data The input data array + * @param {Float64Array} out The output data array + * @param {number} outOff The output offset + * @param {number} off The input offset + * @param {number} step The step + * + * @returns {void} + */ + _singleRealTransform2(data, out, outOff, off, step) { + // radix-2 implementation + // NOTE: Only called for len=4 - /** - * Converts a complex number representation stored in a Float64Array to an array of real numbers. - * - * @param {Float64Array} complex The complex number representation to be converted. - * @param {number[]} [storage] An optional array to store the result in. - * @returns {number[]} An array of real numbers representing the input complex number representation. 
- */ - fromComplexArray(complex, storage) { - const res = storage || new Array(complex.length >>> 1); - for (let i = 0; i < complex.length; i += 2) - res[i >>> 1] = complex[i]; - return res; - } + const evenR = data[off]; + const oddR = data[off + step]; - /** - * Convert a real-valued input array to a complex-valued output array. - * @param {Float64Array} input The real-valued input array. - * @param {Float64Array} [storage] Optional buffer to store the output array. - * @returns {Float64Array} The complex-valued output array. - */ - toComplexArray(input, storage) { - const res = storage || this.createComplexArray(); - for (let i = 0; i < res.length; i += 2) { - res[i] = input[i >>> 1]; - res[i + 1] = 0; - } - return res; - } + out[outOff] = evenR + oddR; + out[outOff + 1] = 0; + out[outOff + 2] = evenR - oddR; + out[outOff + 3] = 0; + } - /** - * Completes the spectrum by adding its mirrored negative frequency components. - * @param {Float64Array} spectrum The input spectrum. - * @returns {void} - */ - completeSpectrum(spectrum) { - const size = this._csize; - const half = size >>> 1; - for (let i = 2; i < half; i += 2) { - spectrum[size - i] = spectrum[i]; - spectrum[size - i + 1] = -spectrum[i + 1]; - } - } + /** + * Computes a single real-valued transform using radix-4 algorithm. + * This method is only called for len=8. + * + * @param {Float64Array} data The input data array. + * @param {Float64Array} out The output data array. + * @param {number} outOff The offset into the output array. + * @param {number} off The offset into the input array. + * @param {number} step The step size for the input array. + * @param {number} inv The value of inverse. + */ + _singleRealTransform4(data, out, outOff, off, step, inv) { + // radix-4 + // NOTE: Only called for len=8 + const step2 = step * 2; + const step3 = step * 3; - /** - * Performs a Fast Fourier Transform (FFT) on the given input data and stores the result in the output buffer. 
- * - * @param {Float64Array} out The output buffer to store the result. - * @param {Float64Array} data The input data to transform. - * - * @throws {Error} Input and output buffers must be different. - * - * @returns {void} - */ - transform(out, data) { - if (out === data) - throw new Error('Input and output buffers must be different'); + // Original values + const Ar = data[off]; + const Br = data[off + step]; + const Cr = data[off + step2]; + const Dr = data[off + step3]; - this._transform4(out, data, 1 /* DONE */); - } + // Pre-Final values + const T0r = Ar + Cr; + const T1r = Ar - Cr; + const T2r = Br + Dr; + const T3r = inv * (Br - Dr); - /** - * Performs a real-valued forward FFT on the given input buffer and stores the result in the given output buffer. - * The input buffer must contain real values only, while the output buffer will contain complex values. The input and - * output buffers must be different. - * - * @param {Float64Array} out The output buffer. - * @param {Float64Array} data The input buffer containing real values. - * - * @throws {Error} If the input and output buffers are the same. - */ - realTransform(out, data) { - if (out === data) - throw new Error('Input and output buffers must be different'); - - this._realTransform4(out, data, 1 /* DONE */); - } - - /** - * Performs an inverse FFT transformation on the given `data` array, and stores the result in `out`. - * The `out` array must be a different buffer than the `data` array. The `out` array will contain the - * result of the transformation. The `data` array will not be modified. - * - * @param {Float64Array} out The output buffer for the transformed data. - * @param {Float64Array} data The input data to transform. - * @throws {Error} If `out` and `data` refer to the same buffer. 
- * @returns {void} - */ - inverseTransform(out, data) { - if (out === data) - throw new Error('Input and output buffers must be different'); - - this._transform4(out, data, -1 /* DONE */); - for (let i = 0; i < out.length; ++i) - out[i] /= this.size; - } - - /** - * Performs a radix-4 implementation of a discrete Fourier transform on a given set of data. - * - * @param {Float64Array} out The output buffer for the transformed data. - * @param {Float64Array} data The input buffer of data to be transformed. - * @param {number} inv A scaling factor to apply to the transform. - * @returns {void} - */ - _transform4(out, data, inv) { - // radix-4 implementation - - const size = this._csize; - - // Initial step (permute and transform) - const width = this._width; - let step = 1 << width; - let len = (size / step) << 1; - - let outOff; - let t; - const bitrev = this._bitrev; - if (len === 4) { - for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { - const off = bitrev[t]; - this._singleTransform2(data, out, outOff, off, step); - } - } else { - // len === 8 - for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { - const off = bitrev[t]; - this._singleTransform4(data, out, outOff, off, step, inv); - } - } - - // Loop through steps in decreasing order - for (step >>= 2; step >= 2; step >>= 2) { - len = (size / step) << 1; - const quarterLen = len >>> 2; - - // Loop through offsets in the data - for (outOff = 0; outOff < size; outOff += len) { - // Full case - const limit = outOff + quarterLen - 1; - for (let i = outOff, k = 0; i < limit; i += 2, k += step) { - const A = i; - const B = A + quarterLen; - const C = B + quarterLen; - const D = C + quarterLen; - - // Original values - const Ar = out[A]; - const Ai = out[A + 1]; - const Br = out[B]; - const Bi = out[B + 1]; - const Cr = out[C]; - const Ci = out[C + 1]; - const Dr = out[D]; - const Di = out[D + 1]; - - const tableBr = this.table[k]; - const tableBi = inv * this.table[k + 1]; - const MBr = Br * tableBr 
- Bi * tableBi; - const MBi = Br * tableBi + Bi * tableBr; - - const tableCr = this.table[2 * k]; - const tableCi = inv * this.table[2 * k + 1]; - const MCr = Cr * tableCr - Ci * tableCi; - const MCi = Cr * tableCi + Ci * tableCr; - - const tableDr = this.table[3 * k]; - const tableDi = inv * this.table[3 * k + 1]; - const MDr = Dr * tableDr - Di * tableDi; - const MDi = Dr * tableDi + Di * tableDr; - - // Pre-Final values - const T0r = Ar + MCr; - const T0i = Ai + MCi; - const T1r = Ar - MCr; - const T1i = Ai - MCi; - const T2r = MBr + MDr; - const T2i = MBi + MDi; - const T3r = inv * (MBr - MDr); - const T3i = inv * (MBi - MDi); - - // Final values - out[A] = T0r + T2r; - out[A + 1] = T0i + T2i; - out[B] = T1r + T3i; - out[B + 1] = T1i - T3r; - out[C] = T0r - T2r; - out[C + 1] = T0i - T2i; - out[D] = T1r - T3i; - out[D + 1] = T1i + T3r; - } - } - } - } - - /** - * Performs a radix-2 implementation of a discrete Fourier transform on a given set of data. - * - * @param {Float64Array} data The input buffer of data to be transformed. - * @param {Float64Array} out The output buffer for the transformed data. - * @param {number} outOff The offset at which to write the output data. - * @param {number} off The offset at which to begin reading the input data. - * @param {number} step The step size for indexing the input data. 
- * @returns {void} - */ - _singleTransform2(data, out, outOff, off, step) { - // radix-2 implementation - // NOTE: Only called for len=4 - - const evenR = data[off]; - const evenI = data[off + 1]; - const oddR = data[off + step]; - const oddI = data[off + step + 1]; - - out[outOff] = evenR + oddR; - out[outOff + 1] = evenI + oddI; - out[outOff + 2] = evenR - oddR; - out[outOff + 3] = evenI - oddI; - } - - /** - * Performs radix-4 transformation on input data of length 8 - * - * @param {Float64Array} data Input data array of length 8 - * @param {Float64Array} out Output data array of length 8 - * @param {number} outOff Index of output array to start writing from - * @param {number} off Index of input array to start reading from - * @param {number} step Step size between elements in input array - * @param {number} inv Scaling factor for inverse transform - * - * @returns {void} - */ - _singleTransform4(data, out, outOff, off, step, inv) { - // radix-4 - // NOTE: Only called for len=8 - const step2 = step * 2; - const step3 = step * 3; - - // Original values - const Ar = data[off]; - const Ai = data[off + 1]; - const Br = data[off + step]; - const Bi = data[off + step + 1]; - const Cr = data[off + step2]; - const Ci = data[off + step2 + 1]; - const Dr = data[off + step3]; - const Di = data[off + step3 + 1]; - - // Pre-Final values - const T0r = Ar + Cr; - const T0i = Ai + Ci; - const T1r = Ar - Cr; - const T1i = Ai - Ci; - const T2r = Br + Dr; - const T2i = Bi + Di; - const T3r = inv * (Br - Dr); - const T3i = inv * (Bi - Di); - - // Final values - out[outOff] = T0r + T2r; - out[outOff + 1] = T0i + T2i; - out[outOff + 2] = T1r + T3i; - out[outOff + 3] = T1i - T3r; - out[outOff + 4] = T0r - T2r; - out[outOff + 5] = T0i - T2i; - out[outOff + 6] = T1r - T3i; - out[outOff + 7] = T1i + T3r; - } - - /** - * Real input radix-4 implementation - * @param {Float64Array} out Output array for the transformed data - * @param {Float64Array} data Input array of real data to be 
transformed - * @param {number} inv The scale factor used to normalize the inverse transform - */ - _realTransform4(out, data, inv) { - // Real input radix-4 implementation - const size = this._csize; - - // Initial step (permute and transform) - const width = this._width; - let step = 1 << width; - let len = (size / step) << 1; - - let outOff; - let t; - const bitrev = this._bitrev; - if (len === 4) { - for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { - const off = bitrev[t]; - this._singleRealTransform2(data, out, outOff, off >>> 1, step >>> 1); - } - } else { - // len === 8 - for (outOff = 0, t = 0; outOff < size; outOff += len, ++t) { - const off = bitrev[t]; - this._singleRealTransform4(data, out, outOff, off >>> 1, step >>> 1, inv); - } - } - - // TODO: Optimize once https://github.com/indutny/fft.js/issues/25 is fixed - // Loop through steps in decreasing order - for (step >>= 2; step >= 2; step >>= 2) { - len = (size / step) << 1; - const quarterLen = len >>> 2; - - // Loop through offsets in the data - for (outOff = 0; outOff < size; outOff += len) { - // Full case - const limit = outOff + quarterLen - 1; - for (let i = outOff, k = 0; i < limit; i += 2, k += step) { - const A = i; - const B = A + quarterLen; - const C = B + quarterLen; - const D = C + quarterLen; - - // Original values - const Ar = out[A]; - const Ai = out[A + 1]; - const Br = out[B]; - const Bi = out[B + 1]; - const Cr = out[C]; - const Ci = out[C + 1]; - const Dr = out[D]; - const Di = out[D + 1]; - - const tableBr = this.table[k]; - const tableBi = inv * this.table[k + 1]; - const MBr = Br * tableBr - Bi * tableBi; - const MBi = Br * tableBi + Bi * tableBr; - - const tableCr = this.table[2 * k]; - const tableCi = inv * this.table[2 * k + 1]; - const MCr = Cr * tableCr - Ci * tableCi; - const MCi = Cr * tableCi + Ci * tableCr; - - const tableDr = this.table[3 * k]; - const tableDi = inv * this.table[3 * k + 1]; - const MDr = Dr * tableDr - Di * tableDi; - const MDi = Dr * 
tableDi + Di * tableDr; - - // Pre-Final values - const T0r = Ar + MCr; - const T0i = Ai + MCi; - const T1r = Ar - MCr; - const T1i = Ai - MCi; - const T2r = MBr + MDr; - const T2i = MBi + MDi; - const T3r = inv * (MBr - MDr); - const T3i = inv * (MBi - MDi); - - // Final values - out[A] = T0r + T2r; - out[A + 1] = T0i + T2i; - out[B] = T1r + T3i; - out[B + 1] = T1i - T3r; - out[C] = T0r - T2r; - out[C + 1] = T0i - T2i; - out[D] = T1r - T3i; - out[D + 1] = T1i + T3r; - } - } - } - } - - /** - * Performs a single real input radix-2 transformation on the provided data - * - * @param {Float64Array} data The input data array - * @param {Float64Array} out The output data array - * @param {number} outOff The output offset - * @param {number} off The input offset - * @param {number} step The step - * - * @returns {void} - */ - _singleRealTransform2(data, out, outOff, off, step) { - // radix-2 implementation - // NOTE: Only called for len=4 - - const evenR = data[off]; - const oddR = data[off + step]; - - out[outOff] = evenR + oddR; - out[outOff + 1] = 0; - out[outOff + 2] = evenR - oddR; - out[outOff + 3] = 0; - } - - /** - * Computes a single real-valued transform using radix-4 algorithm. - * This method is only called for len=8. - * - * @param {Float64Array} data The input data array. - * @param {Float64Array} out The output data array. - * @param {number} outOff The offset into the output array. - * @param {number} off The offset into the input array. - * @param {number} step The step size for the input array. - * @param {number} inv The value of inverse. 
- */ - _singleRealTransform4(data, out, outOff, off, step, inv) { - // radix-4 - // NOTE: Only called for len=8 - const step2 = step * 2; - const step3 = step * 3; - - // Original values - const Ar = data[off]; - const Br = data[off + step]; - const Cr = data[off + step2]; - const Dr = data[off + step3]; - - // Pre-Final values - const T0r = Ar + Cr; - const T1r = Ar - Cr; - const T2r = Br + Dr; - const T3r = inv * (Br - Dr); - - // Final values - out[outOff] = T0r + T2r; - out[outOff + 1] = 0; - out[outOff + 2] = T1r; - out[outOff + 3] = -T3r; - out[outOff + 4] = T0r - T2r; - out[outOff + 5] = 0; - out[outOff + 6] = T1r; - out[outOff + 7] = T3r; - } + // Final values + out[outOff] = T0r + T2r; + out[outOff + 1] = 0; + out[outOff + 2] = T1r; + out[outOff + 3] = -T3r; + out[outOff + 4] = T0r - T2r; + out[outOff + 5] = 0; + out[outOff + 6] = T1r; + out[outOff + 7] = T3r; + } } /** * NP2FFT class provides functionality for performing Fast Fourier Transform on arrays * which are not a power of two in length. In such cases, the chirp-z transform is used. - * + * * For more information, see: https://math.stackexchange.com/questions/77118/non-power-of-2-ffts/77156#77156 */ class NP2FFT { + /** + * Constructs a new NP2FFT object. + * @param {number} fft_length The length of the FFT + */ + constructor(fft_length) { + // Helper variables + const a = 2 * (fft_length - 1); + const b = 2 * (2 * fft_length - 1); + const nextP2 = 2 ** Math.ceil(Math.log2(b)); + this.bufferSize = nextP2; + this._a = a; - /** - * Constructs a new NP2FFT object. 
- * @param {number} fft_length The length of the FFT - */ - constructor(fft_length) { - // Helper variables - const a = 2 * (fft_length - 1); - const b = 2 * (2 * fft_length - 1); - const nextP2 = 2 ** (Math.ceil(Math.log2(b))) - this.bufferSize = nextP2; - this._a = a; + // Define buffers + // Compute chirp for transform + const chirp = new Float64Array(b); + const ichirp = new Float64Array(nextP2); + this._chirpBuffer = new Float64Array(nextP2); + this._buffer1 = new Float64Array(nextP2); + this._buffer2 = new Float64Array(nextP2); + this._outBuffer1 = new Float64Array(nextP2); + this._outBuffer2 = new Float64Array(nextP2); - // Define buffers - // Compute chirp for transform - const chirp = new Float64Array(b); - const ichirp = new Float64Array(nextP2); - this._chirpBuffer = new Float64Array(nextP2); - this._buffer1 = new Float64Array(nextP2); - this._buffer2 = new Float64Array(nextP2); - this._outBuffer1 = new Float64Array(nextP2); - this._outBuffer2 = new Float64Array(nextP2); + // Compute complex exponentiation + const theta = (-2 * Math.PI) / fft_length; + const baseR = Math.cos(theta); + const baseI = Math.sin(theta); - // Compute complex exponentiation - const theta = -2 * Math.PI / fft_length; - const baseR = Math.cos(theta); - const baseI = Math.sin(theta); + // Precompute helper for chirp-z transform + for (let i = 0; i < b >> 1; ++i) { + // Compute complex power: + const e = (i + 1 - fft_length) ** 2 / 2.0; - // Precompute helper for chirp-z transform - for (let i = 0; i < b >> 1; ++i) { - // Compute complex power: - const e = (i + 1 - fft_length) ** 2 / 2.0; + // Compute the modulus and argument of the result + const result_mod = Math.sqrt(baseR ** 2 + baseI ** 2) ** e; + const result_arg = e * Math.atan2(baseI, baseR); - // Compute the modulus and argument of the result - const result_mod = Math.sqrt(baseR ** 2 + baseI ** 2) ** e; - const result_arg = e * Math.atan2(baseI, baseR); + // Convert the result back to rectangular form + // and assign to 
chirp and ichirp + const i2 = 2 * i; + chirp[i2] = result_mod * Math.cos(result_arg); + chirp[i2 + 1] = result_mod * Math.sin(result_arg); - // Convert the result back to rectangular form - // and assign to chirp and ichirp - const i2 = 2 * i; - chirp[i2] = result_mod * Math.cos(result_arg); - chirp[i2 + 1] = result_mod * Math.sin(result_arg); - - // conjugate - ichirp[i2] = chirp[i2]; - ichirp[i2 + 1] = - chirp[i2 + 1]; - } - this._slicedChirpBuffer = chirp.subarray(a, b); - - // create object to perform Fast Fourier Transforms - // with `nextP2` complex numbers - this._f = new P2FFT(nextP2 >> 1); - this._f.transform(this._chirpBuffer, ichirp); + // conjugate + ichirp[i2] = chirp[i2]; + ichirp[i2 + 1] = -chirp[i2 + 1]; } + this._slicedChirpBuffer = chirp.subarray(a, b); - _transform(output, input, real) { - const ib1 = this._buffer1; - const ib2 = this._buffer2; - const ob2 = this._outBuffer1; - const ob3 = this._outBuffer2; - const cb = this._chirpBuffer; - const sb = this._slicedChirpBuffer; - const a = this._a; + // create object to perform Fast Fourier Transforms + // with `nextP2` complex numbers + this._f = new P2FFT(nextP2 >> 1); + this._f.transform(this._chirpBuffer, ichirp); + } - if (real) { - // Real multiplication - for (let j = 0; j < sb.length; j += 2) { - const j2 = j + 1 - const j3 = j >> 1; + _transform(output, input, real) { + const ib1 = this._buffer1; + const ib2 = this._buffer2; + const ob2 = this._outBuffer1; + const ob3 = this._outBuffer2; + const cb = this._chirpBuffer; + const sb = this._slicedChirpBuffer; + const a = this._a; - const a_real = input[j3]; - ib1[j] = a_real * sb[j]; - ib1[j2] = a_real * sb[j2]; - } - } else { - // Complex multiplication - for (let j = 0; j < sb.length; j += 2) { - const j2 = j + 1 - ib1[j] = input[j] * sb[j] - input[j2] * sb[j2]; - ib1[j2] = input[j] * sb[j2] + input[j2] * sb[j]; - } - } - this._f.transform(ob2, ib1); + if (real) { + // Real multiplication + for (let j = 0; j < sb.length; j += 2) { + const 
j2 = j + 1; + const j3 = j >> 1; - for (let j = 0; j < cb.length; j += 2) { - const j2 = j + 1; - - ib2[j] = ob2[j] * cb[j] - ob2[j2] * cb[j2]; - ib2[j2] = ob2[j] * cb[j2] + ob2[j2] * cb[j]; - } - this._f.inverseTransform(ob3, ib2); - - for (let j = 0; j < ob3.length; j += 2) { - const a_real = ob3[j + a]; - const a_imag = ob3[j + a + 1]; - const b_real = sb[j]; - const b_imag = sb[j + 1]; - - output[j] = a_real * b_real - a_imag * b_imag; - output[j + 1] = a_real * b_imag + a_imag * b_real; - } + const a_real = input[j3]; + ib1[j] = a_real * sb[j]; + ib1[j2] = a_real * sb[j2]; + } + } else { + // Complex multiplication + for (let j = 0; j < sb.length; j += 2) { + const j2 = j + 1; + ib1[j] = input[j] * sb[j] - input[j2] * sb[j2]; + ib1[j2] = input[j] * sb[j2] + input[j2] * sb[j]; + } } + this._f.transform(ob2, ib1); - transform(output, input) { - this._transform(output, input, false); - } + for (let j = 0; j < cb.length; j += 2) { + const j2 = j + 1; - realTransform(output, input) { - this._transform(output, input, true); + ib2[j] = ob2[j] * cb[j] - ob2[j2] * cb[j2]; + ib2[j2] = ob2[j] * cb[j2] + ob2[j2] * cb[j]; } + this._f.inverseTransform(ob3, ib2); + + for (let j = 0; j < ob3.length; j += 2) { + const a_real = ob3[j + a]; + const a_imag = ob3[j + a + 1]; + const b_real = sb[j]; + const b_imag = sb[j + 1]; + + output[j] = a_real * b_real - a_imag * b_imag; + output[j + 1] = a_real * b_imag + a_imag * b_real; + } + } + + transform(output, input) { + this._transform(output, input, false); + } + + realTransform(output, input) { + this._transform(output, input, true); + } } export class FFT { - constructor(fft_length) { - this.fft_length = fft_length; - this.isPowerOfTwo = isPowerOfTwo(fft_length); - if (this.isPowerOfTwo) { - this.fft = new P2FFT(fft_length); - this.outputBufferSize = 2 * fft_length; - } else { - this.fft = new NP2FFT(fft_length); - this.outputBufferSize = this.fft.bufferSize; - } + constructor(fft_length) { + this.fft_length = fft_length; + 
this.isPowerOfTwo = isPowerOfTwo(fft_length); + if (this.isPowerOfTwo) { + this.fft = new P2FFT(fft_length); + this.outputBufferSize = 2 * fft_length; + } else { + this.fft = new NP2FFT(fft_length); + this.outputBufferSize = this.fft.bufferSize; } + } - realTransform(out, input) { - this.fft.realTransform(out, input); - } + realTransform(out, input) { + this.fft.realTransform(out, input); + } - transform(out, input) { - this.fft.transform(out, input); - } + transform(out, input) { + this.fft.transform(out, input); + } } - /** * Performs median filter on the provided data. Padding is done by mirroring the data. * @param {AnyTypedArray} data The input array * @param {number} windowSize The window size */ export function medianFilter(data, windowSize) { + if (windowSize % 2 === 0 || windowSize <= 0) { + throw new Error("Window size must be a positive odd number"); + } - if (windowSize % 2 === 0 || windowSize <= 0) { - throw new Error('Window size must be a positive odd number'); + // @ts-ignore + const outputArray = new data.constructor(data.length); + + // @ts-ignore + const buffer = new data.constructor(windowSize); // Reusable array for storing values + + const halfWindowSize = Math.floor(windowSize / 2); + + for (let i = 0; i < data.length; ++i) { + let valuesIndex = 0; + + for (let j = -halfWindowSize; j <= halfWindowSize; ++j) { + let index = i + j; + if (index < 0) { + index = Math.abs(index); + } else if (index >= data.length) { + index = 2 * (data.length - 1) - index; + } + + buffer[valuesIndex++] = data[index]; } - // @ts-ignore - const outputArray = new data.constructor(data.length); + buffer.sort(); + outputArray[i] = buffer[halfWindowSize]; + } - // @ts-ignore - const buffer = new data.constructor(windowSize); // Reusable array for storing values - - const halfWindowSize = Math.floor(windowSize / 2); - - for (let i = 0; i < data.length; ++i) { - let valuesIndex = 0; - - for (let j = -halfWindowSize; j <= halfWindowSize; ++j) { - let index = i + j; - if 
(index < 0) { - index = Math.abs(index); - } else if (index >= data.length) { - index = 2 * (data.length - 1) - index; - } - - buffer[valuesIndex++] = data[index]; - } - - buffer.sort(); - outputArray[i] = buffer[halfWindowSize]; - } - - return outputArray; + return outputArray; } /** @@ -949,6 +949,6 @@ export function medianFilter(data, windowSize) { * @returns {number} The rounded number */ export function round(num, decimals) { - const pow = Math.pow(10, decimals); - return Math.round(num * pow) / pow; + const pow = Math.pow(10, decimals); + return Math.round(num * pow) / pow; } diff --git a/core/vendor/modules/@xenova/transformers/src/utils/tensor.js b/core/vendor/modules/@xenova/transformers/src/utils/tensor.js index 74cb23880..e19010b73 100644 --- a/core/vendor/modules/@xenova/transformers/src/utils/tensor.js +++ b/core/vendor/modules/@xenova/transformers/src/utils/tensor.js @@ -1,33 +1,29 @@ /** * @file Helper module for `Tensor` processing. - * - * These functions and classes are only used internally, + * + * These functions and classes are only used internally, * meaning an end-user shouldn't need to access anything here. 
- * + * * @module utils/tensor */ -import { ONNX } from '../backends/onnx.js'; - -import { - interpolate_data, - transpose_data -} from './maths.js'; +import { ONNX } from "../backends/onnx.js"; +import { interpolate_data, transpose_data } from "./maths.js"; const DataTypeMap = Object.freeze({ - float32: Float32Array, - float64: Float64Array, - string: Array, // string[] - int8: Int8Array, - uint8: Uint8Array, - int16: Int16Array, - uint16: Uint16Array, - int32: Int32Array, - uint32: Uint32Array, - int64: BigInt64Array, - uint64: BigUint64Array, - bool: Uint8Array, + float32: Float32Array, + float64: Float64Array, + string: Array, // string[] + int8: Int8Array, + uint8: Uint8Array, + int16: Int16Array, + uint16: Uint16Array, + int32: Int32Array, + uint32: Uint32Array, + int64: BigInt64Array, + uint64: BigUint64Array, + bool: Uint8Array, }); /** @@ -38,592 +34,598 @@ const DataTypeMap = Object.freeze({ const ONNXTensor = ONNX.Tensor; export class Tensor { - /** @type {number[]} Dimensions of the tensor. */ - dims; + /** @type {number[]} Dimensions of the tensor. */ + dims; - /** @type {DataType} Type of the tensor. */ - type; + /** @type {DataType} Type of the tensor. */ + type; - /** @type {DataArray} The data stored in the tensor. */ - data; + /** @type {DataArray} The data stored in the tensor. */ + data; - /** @type {number} The number of elements in the tensor. */ - size; + /** @type {number} The number of elements in the tensor. */ + size; - /** - * Create a new Tensor or copy an existing Tensor. - * @param {[DataType, DataArray, number[]]|[import('onnxruntime-common').Tensor]} args - */ - constructor(...args) { - if (args[0] instanceof ONNXTensor) { - // Create shallow copy - Object.assign(this, args[0]); + /** + * Create a new Tensor or copy an existing Tensor. 
+ * @param {[DataType, DataArray, number[]]|[import('onnxruntime-common').Tensor]} args + */ + constructor(...args) { + if (args[0] instanceof ONNXTensor) { + // Create shallow copy + Object.assign(this, args[0]); + } else { + // Create new tensor + Object.assign( + this, + new ONNXTensor( + /** @type {DataType} */ (args[0]), + /** @type {Exclude} */ ( + args[1] + ), + args[2], + ), + ); + } - } else { - // Create new tensor - Object.assign(this, new ONNXTensor( - /** @type {DataType} */(args[0]), - /** @type {Exclude} */(args[1]), - args[2] - )); + return new Proxy(this, { + get: (obj, key) => { + if (typeof key === "string") { + let index = Number(key); + if (Number.isInteger(index)) { + // key is an integer (i.e., index) + return obj._getitem(index); + } } - - return new Proxy(this, { - get: (obj, key) => { - if (typeof key === 'string') { - let index = Number(key); - if (Number.isInteger(index)) { - // key is an integer (i.e., index) - return obj._getitem(index); - } - } - // @ts-ignore - return obj[key]; - }, - set: (obj, key, value) => { - // TODO allow setting of data - - // @ts-ignore - return obj[key] = value; - } - }); - } - - /** - * Returns an iterator object for iterating over the tensor data in row-major order. - * If the tensor has more than one dimension, the iterator will yield subarrays. - * @returns {Iterator} An iterator object for iterating over the tensor data in row-major order. - */ - *[Symbol.iterator]() { - const [iterLength, ...iterDims] = this.dims; - - if (iterDims.length > 0) { - const iterSize = iterDims.reduce((a, b) => a * b); - for (let i = 0; i < iterLength; ++i) { - yield this._subarray(i, iterSize, iterDims); - } - } else { - yield* this.data - } - - } - - /** - * Index into a Tensor object. - * @param {number} index The index to access. - * @returns {Tensor} The data at the specified index. 
- */ - _getitem(index) { - const [iterLength, ...iterDims] = this.dims; - - index = safeIndex(index, iterLength); - - if (iterDims.length > 0) { - const iterSize = iterDims.reduce((a, b) => a * b); - return this._subarray(index, iterSize, iterDims); - } else { - return new Tensor(this.type, [this.data[index]], iterDims); - } - } - - /** - * @param {number|bigint} item The item to search for in the tensor - * @returns {number} The index of the first occurrence of item in the tensor data. - */ - indexOf(item) { - for (let index = 0; index < this.data.length; ++index) { - // Note: == instead of === so we can match Ints with BigInts - if (this.data[index] == item) { - return index; - } - } - return -1; - } - - /** - * @param {number} index - * @param {number} iterSize - * @param {any} iterDims - * @returns {Tensor} - */ - _subarray(index, iterSize, iterDims) { - const o1 = index * iterSize; - const o2 = (index + 1) * iterSize; - - // We use subarray if available (typed array), otherwise we use slice (normal array) - const data = - ('subarray' in this.data) - ? this.data.subarray(o1, o2) - : this.data.slice(o1, o2); - return new Tensor(this.type, data, iterDims); - } - - /** - * Returns the value of this tensor as a standard JavaScript Number. This only works - * for tensors with one element. For other cases, see `Tensor.tolist()`. - * @returns {number|bigint} The value of this tensor as a standard JavaScript Number. - * @throws {Error} If the tensor has more than one element. - */ - item() { - if (this.data.length !== 1) { - throw new Error(`a Tensor with ${this.data.length} elements cannot be converted to Scalar`); - } - return this.data[0]; - } - - /** - * Convert tensor data to a n-dimensional JS list - * @returns {Array} - */ - tolist() { - return reshape(this.data, this.dims) - } - - /** - * Return a new Tensor with the sigmoid function applied to each element. - * @returns {Tensor} The tensor with the sigmoid function applied. 
- */ - sigmoid() { - return this.clone().sigmoid_(); - } - - /** - * Applies the sigmoid function to the tensor in place. - * @returns {Tensor} Returns `this`. - */ - sigmoid_() { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] = 1 / (1 + Math.exp(-this.data[i])); - } - return this; - } - - /** - * Return a new Tensor with every element multiplied by a constant. - * @param {number} val The value to multiply by. - * @returns {Tensor} The new tensor. - */ - mul(val) { - return this.clone().mul_(val); - } - - /** - * Multiply the tensor by a constant in place. - * @param {number} val The value to multiply by. - * @returns {Tensor} Returns `this`. - */ - mul_(val) { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] *= val; - } - return this; - } - - - /** - * Return a new Tensor with every element added by a constant. - * @param {number} val The value to add by. - * @returns {Tensor} The new tensor. - */ - add(val) { - return this.clone().add_(val); - } - - /** - * Add the tensor by a constant in place. - * @param {number} val The value to add by. - * @returns {Tensor} Returns `this`. 
- */ - add_(val) { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] += val; - } - return this; - } - clone() { - return new Tensor(this.type, this.data.slice(), this.dims.slice()); - } - - slice(...slices) { - // This allows for slicing with ranges and numbers - let newTensorDims = []; - let newOffsets = []; - - // slices is an array of numbers or arrays of numbers - // e.g., slices = [0, [1, 3], null, [0, 3]] - for (let sliceIndex = 0; sliceIndex < this.dims.length; ++sliceIndex) { - let slice = slices[sliceIndex]; - - if (slice === null || slice === undefined) { - // null or undefined means take the whole dimension - newOffsets.push([0, this.dims[sliceIndex]]); - newTensorDims.push(this.dims[sliceIndex]); - - } else if (typeof slice === 'number') { - slice = safeIndex(slice, this.dims[sliceIndex], sliceIndex); - - // A number means take a single element - newOffsets.push([slice, slice + 1]); - - } else if (Array.isArray(slice) && slice.length === 2) { - // An array of length 2 means take a range of elements - - if (slice[0] > slice[1]) { - throw new Error(`Invalid slice: ${slice}`); - } - - let offsets = [ - Math.max(slice[0], 0), - Math.min(slice[1], this.dims[sliceIndex]) - ]; - - newOffsets.push(offsets); - newTensorDims.push(offsets[1] - offsets[0]); - - } else { - throw new Error(`Invalid slice: ${slice}`); - } - } - - let newDims = newOffsets.map(([start, end]) => end - start); - let newBufferSize = newDims.reduce((a, b) => a * b); - - // Allocate memory // @ts-ignore - let data = new this.data.constructor(newBufferSize); + return obj[key]; + }, + set: (obj, key, value) => { + // TODO allow setting of data - // Precompute strides - const stride = this.stride(); - - for (let i = 0; i < newBufferSize; ++i) { - let originalIndex = 0; - for (let j = newDims.length - 1, num = i; j >= 0; --j) { - const size = newDims[j]; - originalIndex += ((num % size) + newOffsets[j][0]) * stride[j]; - num = Math.floor(num / size); - } - data[i] = 
this.data[originalIndex]; - } - return new Tensor(this.type, data, newTensorDims); - - } - - /** - * Return a transposed version of this Tensor, according to the provided dimensions. - * @param {...number} dims Dimensions to transpose. - * @returns {Tensor} The transposed tensor. - */ - transpose(...dims) { - return transpose(this, dims); - } - - // TODO: rename transpose to permute - // TODO: implement transpose - - // TODO add .max() and .min() methods - - /** - * Returns the sum of each row of the input tensor in the given dimension dim. - * - * @param {number} [dim=null] The dimension or dimensions to reduce. If `null`, all dimensions are reduced. - * @param {boolean} keepdim Whether the output tensor has `dim` retained or not. - * @returns The summed tensor - */ - sum(dim = null, keepdim = false) { - return this.norm(1, dim, keepdim); - } - - /** - * Returns the matrix norm or vector norm of a given tensor. - * @param {number|string} [p='fro'] The order of norm - * @param {number} [dim=null] Specifies which dimension of the tensor to calculate the norm across. - * If dim is None, the norm will be calculated across all dimensions of input. - * @param {boolean} [keepdim=false] Whether the output tensors have dim retained or not. - * @returns {Tensor} The norm of the tensor. - */ - norm(p = 'fro', dim = null, keepdim = false) { - if (p === 'fro') { - // NOTE: Since we only support integer dims, Frobenius norm produces the same result as p=2. 
- p = 2; - } else if (typeof p === 'string') { - throw Error(`Unsupported norm: ${p}`); - } - - if (dim === null) { - // @ts-ignore - let val = this.data.reduce((a, b) => a + (b ** p), 0) ** (1 / p); - return new Tensor(this.type, [val], []); - } - - // Negative indexing - dim = safeIndex(dim, this.dims.length); - - // Calculate the shape of the resulting array after summation - const resultDims = this.dims.slice(); // Copy the original dimensions - resultDims[dim] = 1; // Remove the specified axis - - // Create a new array to store the accumulated values // @ts-ignore - const result = new this.data.constructor(this.data.length / this.dims[dim]); + return (obj[key] = value); + }, + }); + } - // Iterate over the data array - for (let i = 0; i < this.data.length; ++i) { + /** + * Returns an iterator object for iterating over the tensor data in row-major order. + * If the tensor has more than one dimension, the iterator will yield subarrays. + * @returns {Iterator} An iterator object for iterating over the tensor data in row-major order. + */ + *[Symbol.iterator]() { + const [iterLength, ...iterDims] = this.dims; - // Calculate the index in the resulting array - let resultIndex = 0; + if (iterDims.length > 0) { + const iterSize = iterDims.reduce((a, b) => a * b); + for (let i = 0; i < iterLength; ++i) { + yield this._subarray(i, iterSize, iterDims); + } + } else { + yield* this.data; + } + } - for (let j = this.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) { - const size = this.dims[j]; - if (j !== dim) { - const index = num % size; - resultIndex += index * resultMultiplier; - resultMultiplier *= resultDims[j]; - } - num = Math.floor(num / size); - } + /** + * Index into a Tensor object. + * @param {number} index The index to access. + * @returns {Tensor} The data at the specified index. 
+ */ + _getitem(index) { + const [iterLength, ...iterDims] = this.dims; - // Accumulate the value at the current index - result[resultIndex] += (this.data[i]) ** p; + index = safeIndex(index, iterLength); + + if (iterDims.length > 0) { + const iterSize = iterDims.reduce((a, b) => a * b); + return this._subarray(index, iterSize, iterDims); + } else { + return new Tensor(this.type, [this.data[index]], iterDims); + } + } + + /** + * @param {number|bigint} item The item to search for in the tensor + * @returns {number} The index of the first occurrence of item in the tensor data. + */ + indexOf(item) { + for (let index = 0; index < this.data.length; ++index) { + // Note: == instead of === so we can match Ints with BigInts + if (this.data[index] == item) { + return index; + } + } + return -1; + } + + /** + * @param {number} index + * @param {number} iterSize + * @param {any} iterDims + * @returns {Tensor} + */ + _subarray(index, iterSize, iterDims) { + const o1 = index * iterSize; + const o2 = (index + 1) * iterSize; + + // We use subarray if available (typed array), otherwise we use slice (normal array) + const data = + "subarray" in this.data + ? this.data.subarray(o1, o2) + : this.data.slice(o1, o2); + return new Tensor(this.type, data, iterDims); + } + + /** + * Returns the value of this tensor as a standard JavaScript Number. This only works + * for tensors with one element. For other cases, see `Tensor.tolist()`. + * @returns {number|bigint} The value of this tensor as a standard JavaScript Number. + * @throws {Error} If the tensor has more than one element. + */ + item() { + if (this.data.length !== 1) { + throw new Error( + `a Tensor with ${this.data.length} elements cannot be converted to Scalar`, + ); + } + return this.data[0]; + } + + /** + * Convert tensor data to a n-dimensional JS list + * @returns {Array} + */ + tolist() { + return reshape(this.data, this.dims); + } + + /** + * Return a new Tensor with the sigmoid function applied to each element. 
+ * @returns {Tensor} The tensor with the sigmoid function applied. + */ + sigmoid() { + return this.clone().sigmoid_(); + } + + /** + * Applies the sigmoid function to the tensor in place. + * @returns {Tensor} Returns `this`. + */ + sigmoid_() { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] = 1 / (1 + Math.exp(-this.data[i])); + } + return this; + } + + /** + * Return a new Tensor with every element multiplied by a constant. + * @param {number} val The value to multiply by. + * @returns {Tensor} The new tensor. + */ + mul(val) { + return this.clone().mul_(val); + } + + /** + * Multiply the tensor by a constant in place. + * @param {number} val The value to multiply by. + * @returns {Tensor} Returns `this`. + */ + mul_(val) { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] *= val; + } + return this; + } + + /** + * Return a new Tensor with every element added by a constant. + * @param {number} val The value to add by. + * @returns {Tensor} The new tensor. + */ + add(val) { + return this.clone().add_(val); + } + + /** + * Add the tensor by a constant in place. + * @param {number} val The value to add by. + * @returns {Tensor} Returns `this`. 
+ */ + add_(val) { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] += val; + } + return this; + } + clone() { + return new Tensor(this.type, this.data.slice(), this.dims.slice()); + } + + slice(...slices) { + // This allows for slicing with ranges and numbers + let newTensorDims = []; + let newOffsets = []; + + // slices is an array of numbers or arrays of numbers + // e.g., slices = [0, [1, 3], null, [0, 3]] + for (let sliceIndex = 0; sliceIndex < this.dims.length; ++sliceIndex) { + let slice = slices[sliceIndex]; + + if (slice === null || slice === undefined) { + // null or undefined means take the whole dimension + newOffsets.push([0, this.dims[sliceIndex]]); + newTensorDims.push(this.dims[sliceIndex]); + } else if (typeof slice === "number") { + slice = safeIndex(slice, this.dims[sliceIndex], sliceIndex); + + // A number means take a single element + newOffsets.push([slice, slice + 1]); + } else if (Array.isArray(slice) && slice.length === 2) { + // An array of length 2 means take a range of elements + + if (slice[0] > slice[1]) { + throw new Error(`Invalid slice: ${slice}`); } - if (p !== 1) { - for (let i = 0; i < result.length; ++i) { - result[i] = result[i] ** (1 / p); - } + let offsets = [ + Math.max(slice[0], 0), + Math.min(slice[1], this.dims[sliceIndex]), + ]; + + newOffsets.push(offsets); + newTensorDims.push(offsets[1] - offsets[0]); + } else { + throw new Error(`Invalid slice: ${slice}`); + } + } + + let newDims = newOffsets.map(([start, end]) => end - start); + let newBufferSize = newDims.reduce((a, b) => a * b); + + // Allocate memory + // @ts-ignore + let data = new this.data.constructor(newBufferSize); + + // Precompute strides + const stride = this.stride(); + + for (let i = 0; i < newBufferSize; ++i) { + let originalIndex = 0; + for (let j = newDims.length - 1, num = i; j >= 0; --j) { + const size = newDims[j]; + originalIndex += ((num % size) + newOffsets[j][0]) * stride[j]; + num = Math.floor(num / size); + } + data[i] = 
this.data[originalIndex]; + } + return new Tensor(this.type, data, newTensorDims); + } + + /** + * Return a transposed version of this Tensor, according to the provided dimensions. + * @param {...number} dims Dimensions to transpose. + * @returns {Tensor} The transposed tensor. + */ + transpose(...dims) { + return transpose(this, dims); + } + + // TODO: rename transpose to permute + // TODO: implement transpose + + // TODO add .max() and .min() methods + + /** + * Returns the sum of each row of the input tensor in the given dimension dim. + * + * @param {number} [dim=null] The dimension or dimensions to reduce. If `null`, all dimensions are reduced. + * @param {boolean} keepdim Whether the output tensor has `dim` retained or not. + * @returns The summed tensor + */ + sum(dim = null, keepdim = false) { + return this.norm(1, dim, keepdim); + } + + /** + * Returns the matrix norm or vector norm of a given tensor. + * @param {number|string} [p='fro'] The order of norm + * @param {number} [dim=null] Specifies which dimension of the tensor to calculate the norm across. + * If dim is None, the norm will be calculated across all dimensions of input. + * @param {boolean} [keepdim=false] Whether the output tensors have dim retained or not. + * @returns {Tensor} The norm of the tensor. + */ + norm(p = "fro", dim = null, keepdim = false) { + if (p === "fro") { + // NOTE: Since we only support integer dims, Frobenius norm produces the same result as p=2. 
+ p = 2; + } else if (typeof p === "string") { + throw Error(`Unsupported norm: ${p}`); + } + + if (dim === null) { + // @ts-ignore + let val = this.data.reduce((a, b) => a + b ** p, 0) ** (1 / p); + return new Tensor(this.type, [val], []); + } + + // Negative indexing + dim = safeIndex(dim, this.dims.length); + + // Calculate the shape of the resulting array after summation + const resultDims = this.dims.slice(); // Copy the original dimensions + resultDims[dim] = 1; // Remove the specified axis + + // Create a new array to store the accumulated values + // @ts-ignore + const result = new this.data.constructor(this.data.length / this.dims[dim]); + + // Iterate over the data array + for (let i = 0; i < this.data.length; ++i) { + // Calculate the index in the resulting array + let resultIndex = 0; + + for ( + let j = this.dims.length - 1, num = i, resultMultiplier = 1; + j >= 0; + --j + ) { + const size = this.dims[j]; + if (j !== dim) { + const index = num % size; + resultIndex += index * resultMultiplier; + resultMultiplier *= resultDims[j]; } + num = Math.floor(num / size); + } - if (!keepdim) { - resultDims.splice(dim, 1); + // Accumulate the value at the current index + result[resultIndex] += this.data[i] ** p; + } + + if (p !== 1) { + for (let i = 0; i < result.length; ++i) { + result[i] = result[i] ** (1 / p); + } + } + + if (!keepdim) { + resultDims.splice(dim, 1); + } + + return new Tensor(this.type, result, resultDims); + } + + /** + * Performs `L_p` normalization of inputs over specified dimension. Operates in place. + * @param {number} [p=2] The exponent value in the norm formulation + * @param {number} [dim=1] The dimension to reduce + * @returns {Tensor} `this` for operation chaining. 
+ */ + normalize_(p = 2.0, dim = 1) { + dim = safeIndex(dim, this.dims.length); + + const norm = this.norm(p, dim, true); + + for (let i = 0; i < this.data.length; ++i) { + // Calculate the index in the resulting array + let resultIndex = 0; + + for ( + let j = this.dims.length - 1, num = i, resultMultiplier = 1; + j >= 0; + --j + ) { + const size = this.dims[j]; + if (j !== dim) { + const index = num % size; + resultIndex += index * resultMultiplier; + resultMultiplier *= this.dims[j]; } + num = Math.floor(num / size); + } - return new Tensor(this.type, result, resultDims); + // Divide by normalized value + this.data[i] /= norm.data[resultIndex]; } - /** - * Performs `L_p` normalization of inputs over specified dimension. Operates in place. - * @param {number} [p=2] The exponent value in the norm formulation - * @param {number} [dim=1] The dimension to reduce - * @returns {Tensor} `this` for operation chaining. - */ - normalize_(p = 2.0, dim = 1) { - dim = safeIndex(dim, this.dims.length); + return this; + } - const norm = this.norm(p, dim, true); + /** + * Performs `L_p` normalization of inputs over specified dimension. + * @param {number} [p=2] The exponent value in the norm formulation + * @param {number} [dim=1] The dimension to reduce + * @returns {Tensor} The normalized tensor. + */ + normalize(p = 2.0, dim = 1) { + return this.clone().normalize_(p, dim); + } - for (let i = 0; i < this.data.length; ++i) { + /** + * Compute and return the stride of this tensor. + * Stride is the jump necessary to go from one element to the next one in the specified dimension dim. + * @returns {number[]} The stride of this tensor. + */ + stride() { + return dimsToStride(this.dims); + } - // Calculate the index in the resulting array - let resultIndex = 0; + /** + * Returns a tensor with all specified dimensions of input of size 1 removed. 
+ * + * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. + * If you would like a copy, use `tensor.clone()` before squeezing. + * + * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions. + * @returns The squeezed tensor + */ + squeeze(dim = null) { + return new Tensor(this.type, this.data, calc_squeeze_dims(this.dims, dim)); + } - for (let j = this.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) { - const size = this.dims[j]; - if (j !== dim) { - const index = num % size; - resultIndex += index * resultMultiplier; - resultMultiplier *= this.dims[j]; - } - num = Math.floor(num / size); - } + /** + * In-place version of @see {@link Tensor.squeeze} + */ + squeeze_(dim = null) { + this.dims = calc_squeeze_dims(this.dims, dim); + return this; + } - // Divide by normalized value - this.data[i] /= norm.data[resultIndex]; - } + /** + * Returns a new tensor with a dimension of size one inserted at the specified position. + * + * NOTE: The returned tensor shares the same underlying data with this tensor. + * + * @param {number} dim The index at which to insert the singleton dimension + * @returns The unsqueezed tensor + */ + unsqueeze(dim = null) { + return new Tensor( + this.type, + this.data, + calc_unsqueeze_dims(this.dims, dim), + ); + } - return this; - } + /** + * In-place version of @see {@link Tensor.unsqueeze} + */ + unsqueeze_(dim = null) { + this.dims = calc_unsqueeze_dims(this.dims, dim); + return this; + } - /** - * Performs `L_p` normalization of inputs over specified dimension. - * @param {number} [p=2] The exponent value in the norm formulation - * @param {number} [dim=1] The dimension to reduce - * @returns {Tensor} The normalized tensor. 
- */ - normalize(p = 2.0, dim = 1) { - return this.clone().normalize_(p, dim); - } + /** + * In-place version of @see {@link Tensor.flatten} + */ + flatten_(start_dim = 0, end_dim = -1) { + // TODO validate inputs + end_dim = (end_dim + this.dims.length) % this.dims.length; - /** - * Compute and return the stride of this tensor. - * Stride is the jump necessary to go from one element to the next one in the specified dimension dim. - * @returns {number[]} The stride of this tensor. - */ - stride() { - return dimsToStride(this.dims); - } + let dimsToKeepBefore = this.dims.slice(0, start_dim); + let dimsToFlatten = this.dims.slice(start_dim, end_dim + 1); + let dimsToKeepAfter = this.dims.slice(end_dim + 1); - /** - * Returns a tensor with all specified dimensions of input of size 1 removed. - * - * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. - * If you would like a copy, use `tensor.clone()` before squeezing. - * - * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions. - * @returns The squeezed tensor - */ - squeeze(dim = null) { - return new Tensor( - this.type, - this.data, - calc_squeeze_dims(this.dims, dim) - ) - } + this.dims = [ + ...dimsToKeepBefore, + dimsToFlatten.reduce((a, b) => a * b, 1), + ...dimsToKeepAfter, + ]; + return this; + } - /** - * In-place version of @see {@link Tensor.squeeze} - */ - squeeze_(dim = null) { - this.dims = calc_squeeze_dims(this.dims, dim); - return this; - } - - /** - * Returns a new tensor with a dimension of size one inserted at the specified position. - * - * NOTE: The returned tensor shares the same underlying data with this tensor. 
- * - * @param {number} dim The index at which to insert the singleton dimension - * @returns The unsqueezed tensor - */ - unsqueeze(dim = null) { - return new Tensor( - this.type, - this.data, - calc_unsqueeze_dims(this.dims, dim) - ); - } - - /** - * In-place version of @see {@link Tensor.unsqueeze} - */ - unsqueeze_(dim = null) { - this.dims = calc_unsqueeze_dims(this.dims, dim); - return this; - } - - /** - * In-place version of @see {@link Tensor.flatten} - */ - flatten_(start_dim = 0, end_dim = -1) { - // TODO validate inputs - end_dim = (end_dim + this.dims.length) % this.dims.length; - - let dimsToKeepBefore = this.dims.slice(0, start_dim); - let dimsToFlatten = this.dims.slice(start_dim, end_dim + 1); - let dimsToKeepAfter = this.dims.slice(end_dim + 1); - - this.dims = [...dimsToKeepBefore, dimsToFlatten.reduce((a, b) => a * b, 1), ...dimsToKeepAfter] - return this; - } - - /** - * Flattens input by reshaping it into a one-dimensional tensor. - * If `start_dim` or `end_dim` are passed, only dimensions starting with `start_dim` - * and ending with `end_dim` are flattened. The order of elements in input is unchanged. - * @param {number} start_dim the first dim to flatten - * @param {number} end_dim the last dim to flatten - * @returns The flattened tensor. - */ - flatten(start_dim = 0, end_dim = -1) { - return this.clone().flatten_(start_dim, end_dim); - } - - /** - * Returns a new tensor with the same data as the `self` tensor but of a different `shape`. - * @param {...number} dims the desired size - * @returns {Tensor} The tensor with the same data but different shape - */ - view(...dims) { - // TODO: validate dims - let inferredIndex = -1; - for (let i = 0; i < dims.length; ++i) { - if (dims[i] === -1) { - if (inferredIndex !== -1) { - throw new Error("Only one dimension can be inferred"); - } - inferredIndex = i; - } - } + /** + * Flattens input by reshaping it into a one-dimensional tensor. 
+ * If `start_dim` or `end_dim` are passed, only dimensions starting with `start_dim` + * and ending with `end_dim` are flattened. The order of elements in input is unchanged. + * @param {number} start_dim the first dim to flatten + * @param {number} end_dim the last dim to flatten + * @returns The flattened tensor. + */ + flatten(start_dim = 0, end_dim = -1) { + return this.clone().flatten_(start_dim, end_dim); + } + /** + * Returns a new tensor with the same data as the `self` tensor but of a different `shape`. + * @param {...number} dims the desired size + * @returns {Tensor} The tensor with the same data but different shape + */ + view(...dims) { + // TODO: validate dims + let inferredIndex = -1; + for (let i = 0; i < dims.length; ++i) { + if (dims[i] === -1) { if (inferredIndex !== -1) { - // Some dimension must be inferred - const productOther = dims.reduce((product, curr, index) => { - return index !== inferredIndex ? product * curr : product - }, 1); - - dims[inferredIndex] = this.data.length / productOther; + throw new Error("Only one dimension can be inferred"); } - return new Tensor(this.type, this.data, dims); // NOTE: uses same underlying storage + inferredIndex = i; + } } - neg_() { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] = -this.data[i]; - } - return this; - } - neg() { - return this.clone().neg_(); - } + if (inferredIndex !== -1) { + // Some dimension must be inferred + const productOther = dims.reduce((product, curr, index) => { + return index !== inferredIndex ? 
product * curr : product; + }, 1); - /** - * In-place version of @see {@link Tensor.clamp} - */ - clamp_(min, max) { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] = Math.min(Math.max(this.data[i], min), max); - } - return this; + dims[inferredIndex] = this.data.length / productOther; } + return new Tensor(this.type, this.data, dims); // NOTE: uses same underlying storage + } - /** - * Clamps all elements in input into the range [ min, max ] - * @param {number} min lower-bound of the range to be clamped to - * @param {number} max upper-bound of the range to be clamped to - * @returns the output tensor. - */ - clamp(min, max) { - return this.clone().clamp_(min, max); + neg_() { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] = -this.data[i]; } + return this; + } + neg() { + return this.clone().neg_(); + } - /** - * In-place version of @see {@link Tensor.round} - */ - round_() { - for (let i = 0; i < this.data.length; ++i) { - this.data[i] = Math.round(this.data[i]); - } - return this; + /** + * In-place version of @see {@link Tensor.clamp} + */ + clamp_(min, max) { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] = Math.min(Math.max(this.data[i], min), max); } + return this; + } - /** - * Rounds elements of input to the nearest integer. - * @returns the output tensor. - */ - round() { - return this.clone().round_(); + /** + * Clamps all elements in input into the range [ min, max ] + * @param {number} min lower-bound of the range to be clamped to + * @param {number} max upper-bound of the range to be clamped to + * @returns the output tensor. + */ + clamp(min, max) { + return this.clone().clamp_(min, max); + } + + /** + * In-place version of @see {@link Tensor.round} + */ + round_() { + for (let i = 0; i < this.data.length; ++i) { + this.data[i] = Math.round(this.data[i]); } + return this; + } - /** - * Performs Tensor dtype conversion. - * @param {DataType} type The desired data type. 
- * @returns {Tensor} The converted tensor. - */ - to(type) { - // If the self Tensor already has the correct dtype, then self is returned. - if (this.type === type) return this; + /** + * Rounds elements of input to the nearest integer. + * @returns the output tensor. + */ + round() { + return this.clone().round_(); + } - // Otherwise, the returned tensor is a copy of self with the desired dtype. - if (!DataTypeMap.hasOwnProperty(type)) { - throw new Error(`Unsupported type: ${type}`); - } - // @ts-ignore - return new Tensor(type, DataTypeMap[type].from(this.data), this.dims); + /** + * Performs Tensor dtype conversion. + * @param {DataType} type The desired data type. + * @returns {Tensor} The converted tensor. + */ + to(type) { + // If the self Tensor already has the correct dtype, then self is returned. + if (this.type === type) return this; + + // Otherwise, the returned tensor is a copy of self with the desired dtype. + if (!DataTypeMap.hasOwnProperty(type)) { + throw new Error(`Unsupported type: ${type}`); } + // @ts-ignore + return new Tensor(type, DataTypeMap[type].from(this.data), this.dims); + } } /** * This creates a nested array of a given type and depth (see examples). - * + * * @example * NestArray; // string[] * @example @@ -651,32 +653,36 @@ export class Tensor { * @returns {NestArray} The reshaped array. 
*/ function reshape(data, dimensions) { + const totalElements = data.length; + const dimensionSize = dimensions.reduce((a, b) => a * b); - const totalElements = data.length; - const dimensionSize = dimensions.reduce((a, b) => a * b); + if (totalElements !== dimensionSize) { + throw Error( + `cannot reshape array of size ${totalElements} into shape (${dimensions})`, + ); + } - if (totalElements !== dimensionSize) { - throw Error(`cannot reshape array of size ${totalElements} into shape (${dimensions})`); - } + /** @type {any} */ + let reshapedArray = data; - /** @type {any} */ - let reshapedArray = data; + for (let i = dimensions.length - 1; i >= 0; i--) { + reshapedArray = reshapedArray.reduce( + (acc, val) => { + let lastArray = acc[acc.length - 1]; - for (let i = dimensions.length - 1; i >= 0; i--) { - reshapedArray = reshapedArray.reduce((acc, val) => { - let lastArray = acc[acc.length - 1]; + if (lastArray.length < dimensions[i]) { + lastArray.push(val); + } else { + acc.push([val]); + } - if (lastArray.length < dimensions[i]) { - lastArray.push(val); - } else { - acc.push([val]); - } + return acc; + }, + [[]], + ); + } - return acc; - }, [[]]); - } - - return reshapedArray[0]; + return reshapedArray[0]; } /** @@ -686,11 +692,14 @@ function reshape(data, dimensions) { * @returns {Tensor} The transposed tensor. */ export function transpose(tensor, axes) { - const [transposedData, shape] = transpose_data(tensor.data, tensor.dims, axes); - return new Tensor(tensor.type, transposedData, shape); + const [transposedData, shape] = transpose_data( + tensor.data, + tensor.dims, + axes, + ); + return new Tensor(tensor.type, transposedData, shape); } - /** * Interpolates an Tensor to the given size. * @param {Tensor} input The input tensor to interpolate. Data must be channel-first (i.e., [c, h, w]) @@ -699,21 +708,25 @@ export function transpose(tensor, axes) { * @param {boolean} align_corners Whether to align corners. * @returns {Tensor} The interpolated tensor. 
*/ -export function interpolate(input, [out_height, out_width], mode = 'bilinear', align_corners = false) { +export function interpolate( + input, + [out_height, out_width], + mode = "bilinear", + align_corners = false, +) { + // Input image dimensions + const in_channels = input.dims.at(-3) ?? 1; + const in_height = input.dims.at(-2); + const in_width = input.dims.at(-1); - // Input image dimensions - const in_channels = input.dims.at(-3) ?? 1; - const in_height = input.dims.at(-2); - const in_width = input.dims.at(-1); - - let output = interpolate_data( - /** @type {import('./maths.js').TypedArray}*/(input.data), - [in_channels, in_height, in_width], - [out_height, out_width], - mode, - align_corners - ); - return new Tensor(input.type, output, [in_channels, out_height, out_width]); + let output = interpolate_data( + /** @type {import('./maths.js').TypedArray}*/ (input.data), + [in_channels, in_height, in_width], + [out_height, out_width], + mode, + align_corners, + ); + return new Tensor(input.type, output, [in_channels, out_height, out_width]); } /** @@ -723,43 +736,41 @@ export function interpolate(input, [out_height, out_width], mode = 'bilinear', a * @returns {Tensor} Returns a new Tensor of shape [batchSize, embedDim]. 
*/ export function mean_pooling(last_hidden_state, attention_mask) { - // last_hidden_state: [batchSize, seqLength, embedDim] - // attention_mask: [batchSize, seqLength] + // last_hidden_state: [batchSize, seqLength, embedDim] + // attention_mask: [batchSize, seqLength] - let shape = [last_hidden_state.dims[0], last_hidden_state.dims[2]]; - // @ts-ignore - let returnedData = new last_hidden_state.data.constructor(shape[0] * shape[1]); - let [batchSize, seqLength, embedDim] = last_hidden_state.dims; + let shape = [last_hidden_state.dims[0], last_hidden_state.dims[2]]; + // @ts-ignore + let returnedData = new last_hidden_state.data.constructor( + shape[0] * shape[1], + ); + let [batchSize, seqLength, embedDim] = last_hidden_state.dims; - let outIndex = 0; - for (let i = 0; i < batchSize; ++i) { - let offset = i * embedDim * seqLength; + let outIndex = 0; + for (let i = 0; i < batchSize; ++i) { + let offset = i * embedDim * seqLength; - for (let k = 0; k < embedDim; ++k) { - let sum = 0; - let count = 0; + for (let k = 0; k < embedDim; ++k) { + let sum = 0; + let count = 0; - let attnMaskOffset = i * seqLength; - let offset2 = offset + k; - // Pool over all words in sequence - for (let j = 0; j < seqLength; ++j) { - // index into attention mask - let attn = Number(attention_mask.data[attnMaskOffset + j]); + let attnMaskOffset = i * seqLength; + let offset2 = offset + k; + // Pool over all words in sequence + for (let j = 0; j < seqLength; ++j) { + // index into attention mask + let attn = Number(attention_mask.data[attnMaskOffset + j]); - count += attn; - sum += last_hidden_state.data[offset2 + j * embedDim] * attn; - } + count += attn; + sum += last_hidden_state.data[offset2 + j * embedDim] * attn; + } - let avg = sum / count; - returnedData[outIndex++] = avg; - } + let avg = sum / count; + returnedData[outIndex++] = avg; } + } - return new Tensor( - last_hidden_state.type, - returnedData, - shape - ) + return new Tensor(last_hidden_state.type, returnedData, shape); 
} /** @@ -770,19 +781,19 @@ export function mean_pooling(last_hidden_state, attention_mask) { * @private */ function calc_squeeze_dims(dims, dim) { - dims = dims.slice(); - if (dim === null) { - dims = dims.filter((d) => d !== 1); - } else if (typeof dim === 'number') { - if (dims[dim] === 1) { - dims.splice(dim, 1); - } - } else if (Array.isArray(dim)) { - dims = dims.filter((x, i) => { - return x !== 1 || !dim.includes(i); - }); + dims = dims.slice(); + if (dim === null) { + dims = dims.filter((d) => d !== 1); + } else if (typeof dim === "number") { + if (dims[dim] === 1) { + dims.splice(dim, 1); } - return dims; + } else if (Array.isArray(dim)) { + dims = dims.filter((x, i) => { + return x !== 1 || !dim.includes(i); + }); + } + return dims; } /** @@ -793,13 +804,13 @@ function calc_squeeze_dims(dims, dim) { * @private */ function calc_unsqueeze_dims(dims, dim) { - // Dimension out of range (e.g., "expected to be in range of [-4, 3], but got 4") - // + 1 since we allow inserting at the end (i.e. dim = -1) - dim = safeIndex(dim, dims.length + 1); - dims = dims.slice(); - // Insert 1 into specified dimension - dims.splice(dim, 0, 1); - return dims; + // Dimension out of range (e.g., "expected to be in range of [-4, 3], but got 4") + // + 1 since we allow inserting at the end (i.e. dim = -1) + dim = safeIndex(dim, dims.length + 1); + dims = dims.slice(); + // Insert 1 into specified dimension + dims.splice(dim, 0, 1); + return dims; } /** @@ -808,20 +819,22 @@ function calc_unsqueeze_dims(dims, dim) { * @param {number} size The size of the array. * @param {number} [dimension=null] The dimension that the index is for (optional). * @returns {number} The index, guaranteed to be non-negative and less than `arrayLength`. - * + * * @throws {Error} If the index is out of range. 
* @private */ function safeIndex(index, size, dimension = null) { - if (index < -size || index >= size) { - throw new Error(`IndexError: index ${index} is out of bounds for dimension${dimension === null ? '' : ' ' + dimension} with size ${size}`); - } + if (index < -size || index >= size) { + throw new Error( + `IndexError: index ${index} is out of bounds for dimension${dimension === null ? "" : " " + dimension} with size ${size}`, + ); + } - if (index < 0) { - // Negative indexing, ensuring positive index - index = ((index % size) + size) % size; - } - return index; + if (index < 0) { + // Negative indexing, ensuring positive index + index = ((index % size) + size) % size; + } + return index; } /** @@ -831,60 +844,62 @@ function safeIndex(index, size, dimension = null) { * @returns {Tensor} The concatenated tensor. */ export function cat(tensors, dim = 0) { - dim = safeIndex(dim, tensors[0].dims.length); + dim = safeIndex(dim, tensors[0].dims.length); - // TODO do validation of shapes + // TODO do validation of shapes - const resultDims = tensors[0].dims.slice(); - resultDims[dim] = tensors.reduce((a, b) => a + b.dims[dim], 0); + const resultDims = tensors[0].dims.slice(); + resultDims[dim] = tensors.reduce((a, b) => a + b.dims[dim], 0); - // Create a new array to store the accumulated values - const resultSize = resultDims.reduce((a, b) => a * b, 1); - // @ts-ignore - const result = new tensors[0].data.constructor(resultSize); + // Create a new array to store the accumulated values + const resultSize = resultDims.reduce((a, b) => a * b, 1); + // @ts-ignore + const result = new tensors[0].data.constructor(resultSize); - // Create output tensor of same type as first - const resultType = tensors[0].type; + // Create output tensor of same type as first + const resultType = tensors[0].type; - if (dim === 0) { - // Handle special case for performance reasons + if (dim === 0) { + // Handle special case for performance reasons - let offset = 0; - for (let t of tensors) { 
- result.set(t.data, offset); - offset += t.data.length; - } - - } else { - - let currentDim = 0; - - for (let t = 0; t < tensors.length; ++t) { - let tensor = tensors[t]; - - // Iterate over the data array - for (let i = 0; i < tensor.data.length; ++i) { - // Calculate the index in the resulting array - let resultIndex = 0; - - for (let j = tensor.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) { - const size = tensor.dims[j]; - let index = num % size; - if (j === dim) { - index += currentDim; - } - resultIndex += index * resultMultiplier; - resultMultiplier *= resultDims[j]; - num = Math.floor(num / size); - } - // Accumulate the value at the current index - result[resultIndex] = tensor.data[i]; - } - - currentDim += tensor.dims[dim]; - } + let offset = 0; + for (let t of tensors) { + result.set(t.data, offset); + offset += t.data.length; } - return new Tensor(resultType, result, resultDims); + } else { + let currentDim = 0; + + for (let t = 0; t < tensors.length; ++t) { + let tensor = tensors[t]; + + // Iterate over the data array + for (let i = 0; i < tensor.data.length; ++i) { + // Calculate the index in the resulting array + let resultIndex = 0; + + for ( + let j = tensor.dims.length - 1, num = i, resultMultiplier = 1; + j >= 0; + --j + ) { + const size = tensor.dims[j]; + let index = num % size; + if (j === dim) { + index += currentDim; + } + resultIndex += index * resultMultiplier; + resultMultiplier *= resultDims[j]; + num = Math.floor(num / size); + } + // Accumulate the value at the current index + result[resultIndex] = tensor.data[i]; + } + + currentDim += tensor.dims[dim]; + } + } + return new Tensor(resultType, result, resultDims); } /** @@ -894,12 +909,14 @@ export function cat(tensors, dim = 0) { * @returns {Tensor} The stacked tensor. 
*/ export function stack(tensors, dim = 0) { - // TODO do validation of shapes - // NOTE: stack expects each tensor to be equal size - return cat(tensors.map(t => t.unsqueeze(dim)), dim); + // TODO do validation of shapes + // NOTE: stack expects each tensor to be equal size + return cat( + tensors.map((t) => t.unsqueeze(dim)), + dim, + ); } - /** * Calculates the standard deviation and mean over the dimensions specified by dim. dim can be a single dimension or `null` to reduce over all dimensions. * @param {Tensor} input the input tenso @@ -909,67 +926,85 @@ export function stack(tensors, dim = 0) { * @returns {Tensor[]} A tuple of (std, mean) tensors. */ export function std_mean(input, dim = null, correction = 1, keepdim = false) { - - if (dim === null) { - // None to reduce over all dimensions. - // @ts-ignore - const sum = input.data.reduce((a, b) => a + b, 0); - const mean = sum / input.data.length; - // @ts-ignore - const std = Math.sqrt(input.data.reduce((a, b) => a + (b - mean) ** 2, 0) / (input.data.length - correction)); - - const meanTensor = new Tensor(input.type, [mean], [/* scalar */]); - const stdTensor = new Tensor(input.type, [std], [/* scalar */]); - - return [stdTensor, meanTensor]; - } - - // Negative indexing - dim = safeIndex(dim, input.dims.length); - - const meanTensor = mean(input, dim, keepdim); - - // Calculate the shape of the resulting array after summation - const resultDims = input.dims.slice(); // Copy the original dimensions - resultDims[dim] = 1; // Remove the specified axis - - // Create a new array to store the accumulated values + if (dim === null) { + // None to reduce over all dimensions. 
// @ts-ignore - const result = new input.data.constructor(input.data.length / input.dims[dim]); + const sum = input.data.reduce((a, b) => a + b, 0); + const mean = sum / input.data.length; + // @ts-ignore + const std = Math.sqrt( + input.data.reduce((a, b) => a + (b - mean) ** 2, 0) / + (input.data.length - correction), + ); - // Iterate over the data array - for (let i = 0; i < input.data.length; ++i) { - - // Calculate the index in the resulting array - let resultIndex = 0; - - for (let j = input.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) { - const size = input.dims[j]; - if (j !== dim) { - const index = num % size; - resultIndex += index * resultMultiplier; - resultMultiplier *= resultDims[j]; - } - num = Math.floor(num / size); - } - - // Accumulate the value at the current index - result[resultIndex] += (input.data[i] - meanTensor.data[resultIndex]) ** 2; - } - - for (let i = 0; i < result.length; ++i) { - result[i] = Math.sqrt(result[i] / (input.dims[dim] - correction)); - } - - if (!keepdim) { - resultDims.splice(dim, 1); - } - - const stdTensor = new Tensor(input.type, result, resultDims); + const meanTensor = new Tensor( + input.type, + [mean], + [ + /* scalar */ + ], + ); + const stdTensor = new Tensor( + input.type, + [std], + [ + /* scalar */ + ], + ); return [stdTensor, meanTensor]; -} + } + // Negative indexing + dim = safeIndex(dim, input.dims.length); + + const meanTensor = mean(input, dim, keepdim); + + // Calculate the shape of the resulting array after summation + const resultDims = input.dims.slice(); // Copy the original dimensions + resultDims[dim] = 1; // Remove the specified axis + + // Create a new array to store the accumulated values + // @ts-ignore + const result = new input.data.constructor( + input.data.length / input.dims[dim], + ); + + // Iterate over the data array + for (let i = 0; i < input.data.length; ++i) { + // Calculate the index in the resulting array + let resultIndex = 0; + + for ( + let j = 
input.dims.length - 1, num = i, resultMultiplier = 1; + j >= 0; + --j + ) { + const size = input.dims[j]; + if (j !== dim) { + const index = num % size; + resultIndex += index * resultMultiplier; + resultMultiplier *= resultDims[j]; + } + num = Math.floor(num / size); + } + + // Accumulate the value at the current index + result[resultIndex] += (input.data[i] - meanTensor.data[resultIndex]) ** 2; + } + + for (let i = 0; i < result.length; ++i) { + result[i] = Math.sqrt(result[i] / (input.dims[dim] - correction)); + } + + if (!keepdim) { + resultDims.splice(dim, 1); + } + + const stdTensor = new Tensor(input.type, result, resultDims); + + return [stdTensor, meanTensor]; +} /** * Returns the mean value of each row of the input tensor in the given dimension dim. @@ -979,159 +1014,168 @@ export function std_mean(input, dim = null, correction = 1, keepdim = false) { * @returns A new tensor with means taken along the specified dimension. */ export function mean(input, dim = null, keepdim = false) { - - if (dim === null) { - // None to reduce over all dimensions. - // @ts-ignore - let val = input.data.reduce((a, b) => a + b, 0); - return new Tensor(input.type, [val / input.data.length], [/* scalar */]); - } - - // Negative indexing - dim = safeIndex(dim, input.dims.length); - - // Calculate the shape of the resulting array after summation - const resultDims = input.dims.slice(); // Copy the original dimensions - resultDims[dim] = 1; // Remove the specified axis - - // Create a new array to store the accumulated values + if (dim === null) { + // None to reduce over all dimensions. 
// @ts-ignore - const result = new input.data.constructor(input.data.length / input.dims[dim]); + let val = input.data.reduce((a, b) => a + b, 0); + return new Tensor( + input.type, + [val / input.data.length], + [ + /* scalar */ + ], + ); + } - // Iterate over the data array - for (let i = 0; i < input.data.length; ++i) { + // Negative indexing + dim = safeIndex(dim, input.dims.length); - // Calculate the index in the resulting array - let resultIndex = 0; + // Calculate the shape of the resulting array after summation + const resultDims = input.dims.slice(); // Copy the original dimensions + resultDims[dim] = 1; // Remove the specified axis - for (let j = input.dims.length - 1, num = i, resultMultiplier = 1; j >= 0; --j) { - const size = input.dims[j]; - if (j !== dim) { - const index = num % size; - resultIndex += index * resultMultiplier; - resultMultiplier *= resultDims[j]; - } - num = Math.floor(num / size); - } + // Create a new array to store the accumulated values + // @ts-ignore + const result = new input.data.constructor( + input.data.length / input.dims[dim], + ); - // Accumulate the value at the current index - result[resultIndex] += input.data[i]; + // Iterate over the data array + for (let i = 0; i < input.data.length; ++i) { + // Calculate the index in the resulting array + let resultIndex = 0; + + for ( + let j = input.dims.length - 1, num = i, resultMultiplier = 1; + j >= 0; + --j + ) { + const size = input.dims[j]; + if (j !== dim) { + const index = num % size; + resultIndex += index * resultMultiplier; + resultMultiplier *= resultDims[j]; + } + num = Math.floor(num / size); } - if (input.dims[dim] !== 1) { - for (let i = 0; i < result.length; ++i) { - result[i] = result[i] / input.dims[dim]; - } - } + // Accumulate the value at the current index + result[resultIndex] += input.data[i]; + } - if (!keepdim) { - resultDims.splice(dim, 1); + if (input.dims[dim] !== 1) { + for (let i = 0; i < result.length; ++i) { + result[i] = result[i] / 
input.dims[dim]; } + } - return new Tensor(input.type, result, resultDims); + if (!keepdim) { + resultDims.splice(dim, 1); + } + + return new Tensor(input.type, result, resultDims); } - /** * * Measures similarity between two temporal sequences (e.g., input audio and output tokens * to generate token-level timestamps). - * @param {Tensor} matrix + * @param {Tensor} matrix * @returns {number[][]} */ export function dynamicTimeWarping(matrix) { - const [output_length, input_length] = matrix.dims; + const [output_length, input_length] = matrix.dims; - const outputShape = [output_length + 1, input_length + 1]; + const outputShape = [output_length + 1, input_length + 1]; - const cost = new Tensor( - 'float32', - new Float32Array(outputShape[0] * outputShape[1]).fill(Infinity), - outputShape - ); + const cost = new Tensor( + "float32", + new Float32Array(outputShape[0] * outputShape[1]).fill(Infinity), + outputShape, + ); - const trace = new Tensor( - 'float32', - new Float32Array(outputShape[0] * outputShape[1]).fill(-1), - outputShape - ) + const trace = new Tensor( + "float32", + new Float32Array(outputShape[0] * outputShape[1]).fill(-1), + outputShape, + ); - // same as `cost[0][0] = 0`; - cost[0].data[0] = 0; + // same as `cost[0][0] = 0`; + cost[0].data[0] = 0; - for (let j = 1; j < input_length + 1; ++j) { - for (let i = 1; i < output_length + 1; ++i) { + for (let j = 1; j < input_length + 1; ++j) { + for (let i = 1; i < output_length + 1; ++i) { + const c0 = cost[i - 1][j - 1].item(); + const c1 = cost[i - 1][j].item(); + const c2 = cost[i][j - 1].item(); - const c0 = cost[i - 1][j - 1].item(); - const c1 = cost[i - 1][j].item(); - const c2 = cost[i][j - 1].item(); + let c, t; + if (c0 < c1 && c0 < c2) { + c = c0; + t = 0; + } else if (c1 < c0 && c1 < c2) { + c = c1; + t = 1; + } else { + c = c2; + t = 2; + } - let c, t; - if (c0 < c1 && c0 < c2) { - c = c0; - t = 0; - } else if (c1 < c0 && c1 < c2) { - c = c1; - t = 1; - } else { - c = c2; - t = 2; - } - - 
cost[i].data[j] = matrix[i - 1][j - 1].item() + c; - trace[i].data[j] = t; - } + cost[i].data[j] = matrix[i - 1][j - 1].item() + c; + trace[i].data[j] = t; } + } - // backtrace - let i = output_length; - let j = input_length; + // backtrace + let i = output_length; + let j = input_length; - // @ts-ignore - trace.data.fill(2, 0, outputShape[1]) // trace[0, :] = 2 - for (let i = 0; i < outputShape[0]; ++i) { // trace[:, 0] = 1 - trace[i].data[0] = 1; + // @ts-ignore + trace.data.fill(2, 0, outputShape[1]); // trace[0, :] = 2 + for (let i = 0; i < outputShape[0]; ++i) { + // trace[:, 0] = 1 + trace[i].data[0] = 1; + } + + let text_indices = []; + let time_indices = []; + + while (i > 0 || j > 0) { + text_indices.push(i - 1); + time_indices.push(j - 1); + + const t = trace[i][j].item(); + switch (t) { + case 0: + --i; + --j; + break; + case 1: + --i; + break; + case 2: + --j; + break; + default: + throw new Error( + `Internal error in dynamic time warping. Unexpected trace[${i}, ${j}]. Please file a bug report.`, + ); } + } - let text_indices = []; - let time_indices = []; - - while (i > 0 || j > 0) { - text_indices.push(i - 1); - time_indices.push(j - 1); - - const t = trace[i][j].item(); - switch (t) { - case 0: - --i; --j; - break; - case 1: - --i; - break; - case 2: - --j; - break; - default: - throw new Error( - `Internal error in dynamic time warping. Unexpected trace[${i}, ${j}]. 
Please file a bug report.` - ) - } - } - - text_indices.reverse(); - time_indices.reverse(); - - return [text_indices, time_indices]; + text_indices.reverse(); + time_indices.reverse(); + return [text_indices, time_indices]; } function dimsToStride(dims) { - const stride = new Array(dims.length); - for (let i = dims.length - 1, s2 = 1; i >= 0; --i) { - stride[i] = s2; - s2 *= dims[i]; - } - return stride; + const stride = new Array(dims.length); + for (let i = dims.length - 1, s2 = 1; i >= 0; --i) { + stride[i] = s2; + s2 *= dims[i]; + } + return stride; } /** @@ -1139,12 +1183,8 @@ function dimsToStride(dims) { * @param {number[]} size A sequence of integers defining the shape of the output tensor. */ export function ones(size) { - const numElements = size.reduce((a, b) => a * b, 1); - return new Tensor( - 'int64', - new BigInt64Array(numElements).fill(1n), - size - ) + const numElements = size.reduce((a, b) => a * b, 1); + return new Tensor("int64", new BigInt64Array(numElements).fill(1n), size); } /** @@ -1153,5 +1193,5 @@ export function ones(size) { * @returns The ones tensor. */ export function ones_like(tensor) { - return ones(tensor.dims); + return ones(tensor.dims); } diff --git a/core/vendor/modules/@xenova/transformers/types/backends/onnx.d.ts b/core/vendor/modules/@xenova/transformers/types/backends/onnx.d.ts index a287ca200..3ec66af5e 100644 --- a/core/vendor/modules/@xenova/transformers/types/backends/onnx.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/backends/onnx.d.ts @@ -1,5 +1,5 @@ /** @type {import('onnxruntime-web')} The ONNX runtime module. 
*/ export let ONNX: typeof ONNX_WEB; export const executionProviders: string[]; -import * as ONNX_WEB from 'onnxruntime-web'; -//# sourceMappingURL=onnx.d.ts.map \ No newline at end of file +import * as ONNX_WEB from "onnxruntime-web"; +//# sourceMappingURL=onnx.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/configs.d.ts b/core/vendor/modules/@xenova/transformers/types/configs.d.ts index fa3989912..1b6d012a0 100644 --- a/core/vendor/modules/@xenova/transformers/types/configs.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/configs.d.ts @@ -3,23 +3,32 @@ * [Python documentation](https://huggingface.co/docs/transformers/main/en/main_classes/configuration#transformers.PretrainedConfig). */ export class PretrainedConfig { - /** - * Loads a pre-trained config from the given `pretrained_model_name_or_path`. - * - * @param {string} pretrained_model_name_or_path The path to the pre-trained config. - * @param {PretrainedOptions} options Additional options for loading the config. - * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. - * - * @returns {Promise} A new instance of the `PretrainedConfig` class. - */ - static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, }?: PretrainedOptions): Promise; - /** - * Create a new PreTrainedTokenizer instance. - * @param {Object} configJSON The JSON of the config. - */ - constructor(configJSON: any); - model_type: any; - is_encoder_decoder: boolean; + /** + * Loads a pre-trained config from the given `pretrained_model_name_or_path`. + * + * @param {string} pretrained_model_name_or_path The path to the pre-trained config. + * @param {PretrainedOptions} options Additional options for loading the config. + * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. 
+ * + * @returns {Promise} A new instance of the `PretrainedConfig` class. + */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }?: PretrainedOptions, + ): Promise; + /** + * Create a new PreTrainedTokenizer instance. + * @param {Object} configJSON The JSON of the config. + */ + constructor(configJSON: any); + model_type: any; + is_encoder_decoder: boolean; } /** * Helper class which is used to instantiate pretrained configs with the `from_pretrained` function. @@ -28,16 +37,25 @@ export class PretrainedConfig { * let config = await AutoConfig.from_pretrained('bert-base-uncased'); */ export class AutoConfig { - /** - * Loads a pre-trained config from the given `pretrained_model_name_or_path`. - * - * @param {string} pretrained_model_name_or_path The path to the pre-trained config. - * @param {PretrainedOptions} options Additional options for loading the config. - * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. - * - * @returns {Promise} A new instance of the `PretrainedConfig` class. - */ - static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, }?: import("./utils/hub.js").PretrainedOptions): Promise; + /** + * Loads a pre-trained config from the given `pretrained_model_name_or_path`. + * + * @param {string} pretrained_model_name_or_path The path to the pre-trained config. + * @param {PretrainedOptions} options Additional options for loading the config. + * @throws {Error} Throws an error if the config.json is not found in the `pretrained_model_name_or_path`. + * + * @returns {Promise} A new instance of the `PretrainedConfig` class. 
+ */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }?: import("./utils/hub.js").PretrainedOptions, + ): Promise; } -export type PretrainedOptions = import('./utils/hub.js').PretrainedOptions; -//# sourceMappingURL=configs.d.ts.map \ No newline at end of file +export type PretrainedOptions = import("./utils/hub.js").PretrainedOptions; +//# sourceMappingURL=configs.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/env.d.ts b/core/vendor/modules/@xenova/transformers/types/env.d.ts index bfba7425b..a2d0cd447 100644 --- a/core/vendor/modules/@xenova/transformers/types/env.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/env.d.ts @@ -1,21 +1,21 @@ export namespace env { - export namespace backends { - export { onnx_env as onnx }; - export let tfjs: {}; - } - export { __dirname }; - export { VERSION as version }; - export let allowRemoteModels: boolean; - export let remoteHost: string; - export let remotePathTemplate: string; - export let allowLocalModels: boolean; - export { localModelPath }; - export { FS_AVAILABLE as useFS }; - export { WEB_CACHE_AVAILABLE as useBrowserCache }; - export { FS_AVAILABLE as useFSCache }; - export { DEFAULT_CACHE_DIR as cacheDir }; - export let useCustomCache: boolean; - export let customCache: any; + export namespace backends { + export { onnx_env as onnx }; + export let tfjs: {}; + } + export { __dirname }; + export { VERSION as version }; + export let allowRemoteModels: boolean; + export let remoteHost: string; + export let remotePathTemplate: string; + export let allowLocalModels: boolean; + export { localModelPath }; + export { FS_AVAILABLE as useFS }; + export { WEB_CACHE_AVAILABLE as useBrowserCache }; + export { FS_AVAILABLE as useFSCache }; + export { DEFAULT_CACHE_DIR as cacheDir }; + export let useCustomCache: boolean; + export let customCache: any; } declare const onnx_env: 
import("onnxruntime-common").Env; declare const __dirname: any; @@ -25,4 +25,4 @@ declare const FS_AVAILABLE: boolean; declare const WEB_CACHE_AVAILABLE: boolean; declare const DEFAULT_CACHE_DIR: any; export {}; -//# sourceMappingURL=env.d.ts.map \ No newline at end of file +//# sourceMappingURL=env.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/models.d.ts b/core/vendor/modules/@xenova/transformers/types/models.d.ts index 380a4702b..e95de5739 100644 --- a/core/vendor/modules/@xenova/transformers/types/models.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/models.d.ts @@ -1,1318 +1,1368 @@ declare const PreTrainedModel_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * A base class for pre-trained models that provides the model configuration and an ONNX session. */ export class PreTrainedModel extends PreTrainedModel_base { - /** - * Instantiate one of the model classes of the library from a pretrained model. - * - * The model class to instantiate is selected based on the `model_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. - * - * @returns {Promise} A new instance of the `PreTrainedModel` class. 
- */ - static from_pretrained(pretrained_model_name_or_path: string, { quantized, progress_callback, config, cache_dir, local_files_only, revision, model_file_name, }?: import('./utils/hub.js').PretrainedOptions): Promise; - /** - * Creates a new instance of the `PreTrainedModel` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - */ - constructor(config: any, session: any); - main_input_name: string; - config: any; - session: any; - can_generate: boolean; - _runBeam: typeof decoderRunBeam; - _getStartBeams: typeof decoderStartBeams; - _updateBeam: typeof decoderUpdatebeam; - _forward: typeof encoderForward; - /** - * Disposes of all the ONNX sessions that were created during inference. - * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. - * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry - */ - dispose(): Promise; - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Object containing input tensors - * @returns {Promise} Object containing output tensors - */ - _call(model_inputs: any): Promise; - /** - * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method - * will be chosen based on the model type. - * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. - * @returns {Promise} The output data from the model in the format specified in the ONNX model. - * @throws {Error} This method must be implemented in subclasses. - */ - forward(model_inputs: any): Promise; - /** - * @param {import('./utils/generation.js').GenerationConfigType} generation_config - * @param {number} input_ids_seq_length The starting sequence length for the input ids. 
- * @returns {LogitsProcessorList} - * @private - */ - private _get_logits_processor; - /** - * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. - * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. - * @param {import('./utils/generation.js').GenerationConfigType} generation_config A `GenerationConfig` object containing generation parameters. - * @returns {import('./utils/generation.js').GenerationConfigType} The final generation config object to be used by the model for text generation. - */ - _get_generation_config(generation_config: import('./utils/generation.js').GenerationConfigType): import('./utils/generation.js').GenerationConfigType; - /** - * @typedef {import('./utils/maths.js').TypedArray} TypedArray - */ - /** - * @typedef {{ sequences: Tensor, decoder_attentions: Tensor, cross_attentions: Tensor }} EncoderDecoderOutput - * @typedef {Object} DecoderOutput - * - * Generates text based on the given inputs and generation configuration using the model. - * @param {Tensor|Array|TypedArray} inputs An array of input token IDs. - * @param {Object|GenerationConfig|null} generation_config The generation configuration to use. If null, default configuration will be used. - * @param {Object|null} logits_processor An optional logits processor to use. If null, a new LogitsProcessorList instance will be created. - * @param {Object} options options - * @param {Object} [options.inputs_attention_mask=null] An optional attention mask for the inputs. - * @returns {Promise} An array of generated output sequences, where each sequence is an array of token IDs. - * @throws {Error} Throws an error if the inputs array is empty. 
- */ - generate(inputs: any[] | import("./transformers.js").TypedArray | Tensor, generation_config?: any | (new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType) | null, logits_processor?: any | null, { inputs_attention_mask }?: { - inputs_attention_mask?: any; - }): Promise; - /** - * Helper function to add attentions to beam - * @param {Object} beam - * @param {Object} output - * @private - */ - private addAttentionsToBeam; - /** - * Groups an array of beam objects by their ids. - * - * @param {Array} beams The array of beam objects to group. - * @returns {Array} An array of arrays, where each inner array contains beam objects with the same id. - */ - groupBeams(beams: any[]): any[]; - /** - * Returns an object containing past key values from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @param {Object} pastKeyValues The previous past key values. - * @returns {Object} An object containing past key values. - */ - getPastKeyValues(decoderResults: any, pastKeyValues: any): any; - /** - * Returns an object containing attentions from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @returns {Object} An object containing attentions. - */ - getAttentions(decoderResults: any): any; - /** - * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. - * - * @param {Object} decoderFeeds The decoder feeds object to add past key values to. - * @param {Object} pastKeyValues An object containing past key values. - */ - addPastKeyValues(decoderFeeds: any, pastKeyValues: any): void; - /** - * Initializes and returns the beam for text generation task - * @param {Tensor} inputTokenIds The input token ids. - * @param {Object} generation_config The generation config. - * @param {number} numOutputTokens The number of tokens to be generated. 
- * @param {Tensor} inputs_attention_mask Optional input attention mask. - * @returns {any} A Beam object representing the initialized beam. - * @private - */ - private getStartBeams; - /** - * Runs a single step of the beam search generation algorithm. - * @param {any} beam The current beam being generated. - * @returns {Promise} The updated beam after a single generation step. - * @private - */ - private runBeam; - /** - * Update a beam with a new token ID. - * @param {Object} beam The beam to update. - * @param {number} newTokenId The new token ID to add to the beam's output. - * @private - */ - private updateBeam; -} -export class ModelOutput { + /** + * Instantiate one of the model classes of the library from a pretrained model. + * + * The model class to instantiate is selected based on the `model_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. + * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. + * + * @returns {Promise} A new instance of the `PreTrainedModel` class. + */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + }?: import("./utils/hub.js").PretrainedOptions, + ): Promise; + /** + * Creates a new instance of the `PreTrainedModel` class. + * @param {Object} config The model configuration. 
+ * @param {any} session session for the model. + */ + constructor(config: any, session: any); + main_input_name: string; + config: any; + session: any; + can_generate: boolean; + _runBeam: typeof decoderRunBeam; + _getStartBeams: typeof decoderStartBeams; + _updateBeam: typeof decoderUpdatebeam; + _forward: typeof encoderForward; + /** + * Disposes of all the ONNX sessions that were created during inference. + * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. + * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry + */ + dispose(): Promise; + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Object containing input tensors + * @returns {Promise} Object containing output tensors + */ + _call(model_inputs: any): Promise; + /** + * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method + * will be chosen based on the model type. + * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. + * @returns {Promise} The output data from the model in the format specified in the ONNX model. + * @throws {Error} This method must be implemented in subclasses. + */ + forward(model_inputs: any): Promise; + /** + * @param {import('./utils/generation.js').GenerationConfigType} generation_config + * @param {number} input_ids_seq_length The starting sequence length for the input ids. + * @returns {LogitsProcessorList} + * @private + */ + private _get_logits_processor; + /** + * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. + * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. 
Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. + * @param {import('./utils/generation.js').GenerationConfigType} generation_config A `GenerationConfig` object containing generation parameters. + * @returns {import('./utils/generation.js').GenerationConfigType} The final generation config object to be used by the model for text generation. + */ + _get_generation_config( + generation_config: import("./utils/generation.js").GenerationConfigType, + ): import("./utils/generation.js").GenerationConfigType; + /** + * @typedef {import('./utils/maths.js').TypedArray} TypedArray + */ + /** + * @typedef {{ sequences: Tensor, decoder_attentions: Tensor, cross_attentions: Tensor }} EncoderDecoderOutput + * @typedef {Object} DecoderOutput + * + * Generates text based on the given inputs and generation configuration using the model. + * @param {Tensor|Array|TypedArray} inputs An array of input token IDs. + * @param {Object|GenerationConfig|null} generation_config The generation configuration to use. If null, default configuration will be used. + * @param {Object|null} logits_processor An optional logits processor to use. If null, a new LogitsProcessorList instance will be created. + * @param {Object} options options + * @param {Object} [options.inputs_attention_mask=null] An optional attention mask for the inputs. + * @returns {Promise} An array of generated output sequences, where each sequence is an array of token IDs. + * @throws {Error} Throws an error if the inputs array is empty. 
+ */ + generate( + inputs: any[] | import("./transformers.js").TypedArray | Tensor, + generation_config?: + | any + | (new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType) + | null, + logits_processor?: any | null, + { + inputs_attention_mask, + }?: { + inputs_attention_mask?: any; + }, + ): Promise; + /** + * Helper function to add attentions to beam + * @param {Object} beam + * @param {Object} output + * @private + */ + private addAttentionsToBeam; + /** + * Groups an array of beam objects by their ids. + * + * @param {Array} beams The array of beam objects to group. + * @returns {Array} An array of arrays, where each inner array contains beam objects with the same id. + */ + groupBeams(beams: any[]): any[]; + /** + * Returns an object containing past key values from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @param {Object} pastKeyValues The previous past key values. + * @returns {Object} An object containing past key values. + */ + getPastKeyValues(decoderResults: any, pastKeyValues: any): any; + /** + * Returns an object containing attentions from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @returns {Object} An object containing attentions. + */ + getAttentions(decoderResults: any): any; + /** + * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. + * + * @param {Object} decoderFeeds The decoder feeds object to add past key values to. + * @param {Object} pastKeyValues An object containing past key values. + */ + addPastKeyValues(decoderFeeds: any, pastKeyValues: any): void; + /** + * Initializes and returns the beam for text generation task + * @param {Tensor} inputTokenIds The input token ids. + * @param {Object} generation_config The generation config. 
+ * @param {number} numOutputTokens The number of tokens to be generated. + * @param {Tensor} inputs_attention_mask Optional input attention mask. + * @returns {any} A Beam object representing the initialized beam. + * @private + */ + private getStartBeams; + /** + * Runs a single step of the beam search generation algorithm. + * @param {any} beam The current beam being generated. + * @returns {Promise} The updated beam after a single generation step. + * @private + */ + private runBeam; + /** + * Update a beam with a new token ID. + * @param {Object} beam The beam to update. + * @param {number} newTokenId The new token ID to add to the beam's output. + * @private + */ + private updateBeam; } +export class ModelOutput {} /** * Base class for model's outputs, with potential hidden states and attentions. */ export class BaseModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. - * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - */ - constructor({ last_hidden_state, hidden_states, attentions }: { - last_hidden_state: Tensor; - hidden_states?: Tensor; - attentions?: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. + * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ */ + constructor({ + last_hidden_state, + hidden_states, + attentions, + }: { last_hidden_state: Tensor; - hidden_states: Tensor; - attentions: Tensor; -} -export class BertPreTrainedModel extends PreTrainedModel { -} -export class BertModel extends BertPreTrainedModel { + hidden_states?: Tensor; + attentions?: Tensor; + }); + last_hidden_state: Tensor; + hidden_states: Tensor; + attentions: Tensor; } +export class BertPreTrainedModel extends PreTrainedModel {} +export class BertModel extends BertPreTrainedModel {} /** * BertForMaskedLM is a class representing a BERT model for masked language modeling. */ export class BertForMaskedLM extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * BertForSequenceClassification is a class representing a BERT model for sequence classification. */ export class BertForSequenceClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * BertForTokenClassification is a class representing a BERT model for token classification. 
*/ export class BertForTokenClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * BertForQuestionAnswering is a class representing a BERT model for question answering. */ export class BertForQuestionAnswering extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class RoFormerPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class RoFormerPreTrainedModel extends PreTrainedModel {} /** * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. */ -export class RoFormerModel extends RoFormerPreTrainedModel { -} +export class RoFormerModel extends RoFormerPreTrainedModel {} /** * RoFormer Model with a `language modeling` head on top. */ export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) * e.g. for Named-Entity-Recognition (NER) tasks. */ export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class ConvBertPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class ConvBertPreTrainedModel extends PreTrainedModel {} /** * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. */ -export class ConvBertModel extends ConvBertPreTrainedModel { -} +export class ConvBertModel extends ConvBertPreTrainedModel {} /** * ConvBERT Model with a language modeling head on top. */ export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) * e.g. for Named-Entity-Recognition (NER) tasks. */ export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`) */ export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class ElectraPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class ElectraPreTrainedModel extends PreTrainedModel {} /** * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. 
* Identical to the BERT model except that it uses an additional linear layer between the embedding * layer and the encoder if the hidden size and embedding size are different. */ -export class ElectraModel extends ElectraPreTrainedModel { -} +export class ElectraModel extends ElectraPreTrainedModel {} /** * Electra model with a language modeling head on top. */ export class ElectraForMaskedLM extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class ElectraForSequenceClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * Electra model with a token classification head on top. */ export class ElectraForTokenClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. 
- */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * LECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class CamembertPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class CamembertPreTrainedModel extends PreTrainedModel {} /** * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. */ -export class CamembertModel extends CamembertPreTrainedModel { -} +export class CamembertModel extends CamembertPreTrainedModel {} /** * CamemBERT Model with a `language modeling` head on top. */ export class CamembertForMaskedLM extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. */ export class CamembertForSequenceClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class CamembertForTokenClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * CamemBERT Model with a span classification head on top for extractive question-answering tasks */ export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. 
- */ - _call(model_inputs: any): Promise; -} -export class DebertaPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class DebertaPreTrainedModel extends PreTrainedModel {} /** * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. */ -export class DebertaModel extends DebertaPreTrainedModel { -} +export class DebertaModel extends DebertaPreTrainedModel {} /** * DeBERTa Model with a `language modeling` head on top. */ export class DebertaForMaskedLM extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class DebertaForSequenceClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + _call(model_inputs: any): Promise; } /** * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class DebertaForTokenClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class DebertaV2PreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class DebertaV2PreTrainedModel extends PreTrainedModel {} /** * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. */ -export class DebertaV2Model extends DebertaV2PreTrainedModel { -} +export class DebertaV2Model extends DebertaV2PreTrainedModel {} /** * DeBERTa-V2 Model with a `language modeling` head on top. 
*/ export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. */ export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. 
+ */ + _call(model_inputs: any): Promise; } /** * DeBERTa-V2 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). */ export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class DistilBertPreTrainedModel extends PreTrainedModel { -} -export class DistilBertModel extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class DistilBertPreTrainedModel extends PreTrainedModel {} +export class DistilBertModel extends DistilBertPreTrainedModel {} /** * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. */ export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. 
*/ export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. */ export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } /** * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. */ export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class EsmPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class EsmPreTrainedModel extends PreTrainedModel {} /** * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. 
*/ -export class EsmModel extends EsmPreTrainedModel { -} +export class EsmModel extends EsmPreTrainedModel {} /** * ESM Model with a `language modeling` head on top. */ export class EsmForMaskedLM extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class EsmForSequenceClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) * e.g. for Named-Entity-Recognition (NER) tasks. */ export class EsmForTokenClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. 
- */ - _call(model_inputs: any): Promise; -} -export class MobileBertPreTrainedModel extends PreTrainedModel { -} -export class MobileBertModel extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } +export class MobileBertPreTrainedModel extends PreTrainedModel {} +export class MobileBertModel extends MobileBertPreTrainedModel {} /** * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. */ export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * MobileBert Model with a span classification head on top for extractive question-answering tasks */ export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class MPNetPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class MPNetPreTrainedModel extends PreTrainedModel {} /** * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. */ -export class MPNetModel extends MPNetPreTrainedModel { -} +export class MPNetModel extends MPNetPreTrainedModel {} /** * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. */ export class MPNetForMaskedLM extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + _call(model_inputs: any): Promise; } /** * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. */ export class MPNetForSequenceClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + _call(model_inputs: any): Promise; } /** * MPNetForTokenClassification is a class representing a MPNet model for token classification. */ export class MPNetForTokenClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. */ export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - _call(model_inputs: any): Promise; -} -export class SqueezeBertPreTrainedModel extends PreTrainedModel { -} -export class SqueezeBertModel extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + _call(model_inputs: any): Promise; } +export class SqueezeBertPreTrainedModel extends PreTrainedModel {} +export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class AlbertPreTrainedModel extends PreTrainedModel { -} -export class AlbertModel extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class AlbertPreTrainedModel extends PreTrainedModel {} +export class AlbertModel extends AlbertPreTrainedModel {} export class AlbertForSequenceClassification extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } export class AlbertForMaskedLM extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class T5PreTrainedModel extends PreTrainedModel { -} -export class T5Model extends T5PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class T5PreTrainedModel extends PreTrainedModel {} +export class T5Model extends T5PreTrainedModel {} /** * T5Model is a class representing a T5 model for conditional generation. */ export class T5ForConditionalGeneration extends T5PreTrainedModel { - /** - * Creates a new instance of the `T5ForConditionalGeneration` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: any; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: any; + /** + * Creates a new instance of the `T5ForConditionalGeneration` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. 
+ * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: any; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: any; } /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class LongT5PreTrainedModel extends PreTrainedModel { -} +export class LongT5PreTrainedModel extends PreTrainedModel {} /** * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. */ -export class LongT5Model extends LongT5PreTrainedModel { -} +export class LongT5Model extends LongT5PreTrainedModel {} /** * LONGT5 Model with a `language modeling` head on top. */ export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel { - /** - * Creates a new instance of the `LongT5ForConditionalGeneration` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: any; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: any; -} -export class MT5PreTrainedModel extends PreTrainedModel { -} -export class MT5Model extends MT5PreTrainedModel { + /** + * Creates a new instance of the `LongT5ForConditionalGeneration` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. + * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: any; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: any; } +export class MT5PreTrainedModel extends PreTrainedModel {} +export class MT5Model extends MT5PreTrainedModel {} /** * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. */ export class MT5ForConditionalGeneration extends MT5PreTrainedModel { - /** - * Creates a new instance of the `MT5ForConditionalGeneration` class. - * @param {any} config The model configuration. 
- * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: any; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: any; -} -export class BartPretrainedModel extends PreTrainedModel { + /** + * Creates a new instance of the `MT5ForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: any; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: any; } +export class BartPretrainedModel extends PreTrainedModel {} /** * The bare BART Model outputting raw hidden-states without any specific head on top. 
*/ -export class BartModel extends BartPretrainedModel { -} +export class BartModel extends BartPretrainedModel {} /** * The BART Model with a language modeling head. Can be used for summarization. */ export class BartForConditionalGeneration extends BartPretrainedModel { - /** - * Creates a new instance of the `BartForConditionalGeneration` class. - * @param {Object} config The configuration object for the Bart model. - * @param {Object} session The ONNX session used to execute the model. - * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. - * @param {Object} generation_config The generation configuration object. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - decoder_merged_session: any; - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; + /** + * Creates a new instance of the `BartForConditionalGeneration` class. + * @param {Object} config The configuration object for the Bart model. + * @param {Object} session The ONNX session used to execute the model. + * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. + * @param {Object} generation_config The generation configuration object. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + decoder_merged_session: any; + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } /** * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) */ export class BartForSequenceClassification extends BartPretrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; -} -export class MBartPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } +export class MBartPreTrainedModel extends PreTrainedModel {} /** * The bare MBART Model outputting raw hidden-states without any specific head on top. */ -export class MBartModel extends MBartPreTrainedModel { -} +export class MBartModel extends MBartPreTrainedModel {} /** * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. */ export class MBartForConditionalGeneration extends MBartPreTrainedModel { - /** - * Creates a new instance of the `MBartForConditionalGeneration` class. - * @param {Object} config The configuration object for the Bart model. - * @param {Object} session The ONNX session used to execute the model. - * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. - * @param {Object} generation_config The generation configuration object. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - decoder_merged_session: any; - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; + /** + * Creates a new instance of the `MBartForConditionalGeneration` class. + * @param {Object} config The configuration object for the Bart model. + * @param {Object} session The ONNX session used to execute the model. + * @param {Object} decoder_merged_session The ONNX session used to execute the decoder. 
+ * @param {Object} generation_config The generation configuration object. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + decoder_merged_session: any; + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } /** * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). */ export class MBartForSequenceClassification extends MBartPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } export class MBartForCausalLM extends MBartPreTrainedModel { - /** - * Creates a new instance of the `MBartForCausalLM` class. - * @param {Object} config Configuration object for the model. - * @param {Object} decoder_merged_session ONNX Session object for the decoder. - * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config: any, decoder_merged_session: any, generation_config: any); - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; -} -export class BlenderbotPreTrainedModel extends PreTrainedModel { + /** + * Creates a new instance of the `MBartForCausalLM` class. + * @param {Object} config Configuration object for the model. + * @param {Object} decoder_merged_session ONNX Session object for the decoder. 
+ * @param {Object} generation_config Configuration object for the generation process. + */ + constructor(config: any, decoder_merged_session: any, generation_config: any); + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } +export class BlenderbotPreTrainedModel extends PreTrainedModel {} /** * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. */ -export class BlenderbotModel extends BlenderbotPreTrainedModel { -} +export class BlenderbotModel extends BlenderbotPreTrainedModel {} /** * The Blenderbot Model with a language modeling head. Can be used for summarization. */ export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel { - /** - * Creates a new instance of the `BlenderbotForConditionalGeneration` class. - * @param {any} config The model configuration. - * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; -} -export class BlenderbotSmallPreTrainedModel extends PreTrainedModel { + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. 
+ * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } +export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} /** * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. */ -export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel { -} +export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} /** * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. */ export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel { - /** - * Creates a new instance of the `BlenderbotForConditionalGeneration` class. - * @param {any} config The model configuration. - * @param {any} session The ONNX session containing the encoder weights. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; -} -export class RobertaPreTrainedModel extends PreTrainedModel { -} -export class RobertaModel extends RobertaPreTrainedModel { + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } +export class RobertaPreTrainedModel extends PreTrainedModel {} +export class RobertaModel extends RobertaPreTrainedModel {} /** * RobertaForMaskedLM class for performing masked language modeling on Roberta models. */ export class RobertaForMaskedLM extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * RobertaForSequenceClassification class for performing sequence classification on Roberta models. */ export class RobertaForSequenceClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * RobertaForTokenClassification class for performing token classification on Roberta models. */ export class RobertaForTokenClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * RobertaForQuestionAnswering class for performing question answering on Roberta models. */ export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class XLMPreTrainedModel extends PreTrainedModel { -} +export class XLMPreTrainedModel extends PreTrainedModel {} /** * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. */ -export class XLMModel extends XLMPreTrainedModel { -} +export class XLMModel extends XLMPreTrainedModel {} /** * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ export class XLMWithLMHeadModel extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) */ export class XLMForSequenceClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) */ export class XLMForTokenClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * XLM Model with a span classification head on top for extractive question-answering tasks */ export class XLMForQuestionAnswering extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class XLMRobertaPreTrainedModel extends PreTrainedModel { -} -export class XLMRobertaModel extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class XLMRobertaPreTrainedModel extends PreTrainedModel {} +export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} /** * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. */ export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. 
*/ export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } /** * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. */ export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + _call(model_inputs: any): Promise; } /** * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. */ export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - _call(model_inputs: any): Promise; -} -export class ASTPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + _call(model_inputs: any): Promise; } +export class ASTPreTrainedModel extends PreTrainedModel {} /** * The bare AST Model transformer outputting raw hidden-states without any specific head on top. 
*/ -export class ASTModel extends ASTPreTrainedModel { -} +export class ASTModel extends ASTPreTrainedModel {} /** * Audio Spectrogram Transformer model with an audio classification head on top * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. */ -export class ASTForAudioClassification extends ASTPreTrainedModel { -} -export class WhisperPreTrainedModel extends PreTrainedModel { -} +export class ASTForAudioClassification extends ASTPreTrainedModel {} +export class WhisperPreTrainedModel extends PreTrainedModel {} /** * WhisperModel class for training Whisper models without a language model head. */ -export class WhisperModel extends WhisperPreTrainedModel { -} +export class WhisperModel extends WhisperPreTrainedModel {} /** * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. */ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { - /** - * Creates a new instance of the `WhisperForConditionalGeneration` class. - * @param {Object} config Configuration object for the model. - * @param {Object} session ONNX Session object for the model. - * @param {Object} decoder_merged_session ONNX Session object for the decoder. - * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - requires_attention_mask: boolean; - decoder_merged_session: any; - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; - /** - * @typedef {Object} WhisperGenerationConfig - * @extends GenerationConfig - * @property {boolean} [return_timestamps=null] Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. 
- * @property {boolean} [return_token_timestamps=null] Whether to return token-level timestamps - * with the text. This can be used with or without the `return_timestamps` option. To get word-level - * timestamps, use the tokenizer to group the tokens into words. - * @property {number} [num_frames=null] The number of audio frames available in this chunk. This is only used generating word-level timestamps. - */ - /** - * Generates outputs based on input and generation configuration. - * @param {Object} inputs Input data for the model. - * @param {WhisperGenerationConfig} generation_config Configuration object for the generation process. - * @param {Object} logits_processor Optional logits processor object. - * @returns {Promise} Promise object represents the generated outputs. - */ - generate(inputs: any, generation_config?: any, logits_processor?: any): Promise; - /** - * Calculates token-level timestamps using the encoder-decoder cross-attentions and - * dynamic time-warping (DTW) to map each output token to a position in the input audio. - * @param {Object} generate_outputs Outputs generated by the model - * @param {Tensor[][][]} generate_outputs.cross_attentions The cross attentions output by the model - * @param {Tensor[][][]} generate_outputs.decoder_attentions The decoder attentions output by the model - * @param {number[][]} generate_outputs.sequences The sequences output by the model - * @param {number[][]} alignment_heads Alignment heads of the model - * @param {number} [num_frames=null] Number of frames in the input audio. 
- * @param {number} [time_precision=0.02] Precision of the timestamps in seconds - * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token - */ - _extract_token_timestamps(generate_outputs: { - cross_attentions: Tensor[][][]; - decoder_attentions: Tensor[][][]; - sequences: number[][]; - }, alignment_heads: number[][], num_frames?: number, time_precision?: number): Tensor; + /** + * Creates a new instance of the `WhisperForConditionalGeneration` class. + * @param {Object} config Configuration object for the model. + * @param {Object} session ONNX Session object for the model. + * @param {Object} decoder_merged_session ONNX Session object for the decoder. + * @param {Object} generation_config Configuration object for the generation process. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + requires_attention_mask: boolean; + decoder_merged_session: any; + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; + /** + * @typedef {Object} WhisperGenerationConfig + * @extends GenerationConfig + * @property {boolean} [return_timestamps=null] Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. + * @property {boolean} [return_token_timestamps=null] Whether to return token-level timestamps + * with the text. This can be used with or without the `return_timestamps` option. To get word-level + * timestamps, use the tokenizer to group the tokens into words. + * @property {number} [num_frames=null] The number of audio frames available in this chunk. This is only used generating word-level timestamps. + */ + /** + * Generates outputs based on input and generation configuration. + * @param {Object} inputs Input data for the model. 
+ * @param {WhisperGenerationConfig} generation_config Configuration object for the generation process. + * @param {Object} logits_processor Optional logits processor object. + * @returns {Promise} Promise object represents the generated outputs. + */ + generate( + inputs: any, + generation_config?: any, + logits_processor?: any, + ): Promise; + /** + * Calculates token-level timestamps using the encoder-decoder cross-attentions and + * dynamic time-warping (DTW) to map each output token to a position in the input audio. + * @param {Object} generate_outputs Outputs generated by the model + * @param {Tensor[][][]} generate_outputs.cross_attentions The cross attentions output by the model + * @param {Tensor[][][]} generate_outputs.decoder_attentions The decoder attentions output by the model + * @param {number[][]} generate_outputs.sequences The sequences output by the model + * @param {number[][]} alignment_heads Alignment heads of the model + * @param {number} [num_frames=null] Number of frames in the input audio. + * @param {number} [time_precision=0.02] Precision of the timestamps in seconds + * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token + */ + _extract_token_timestamps( + generate_outputs: { + cross_attentions: Tensor[][][]; + decoder_attentions: Tensor[][][]; + sequences: number[][]; + }, + alignment_heads: number[][], + num_frames?: number, + time_precision?: number, + ): Tensor; } /** * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks */ export class VisionEncoderDecoderModel extends PreTrainedModel { - /** - * Creates a new instance of the `VisionEncoderDecoderModel` class. - * @param {Object} config The configuration object specifying the hyperparameters and other model settings. - * @param {Object} session The ONNX session containing the encoder model. - * @param {any} decoder_merged_session The ONNX session containing the merged decoder model. 
- * @param {Object} generation_config Configuration object for the generation process. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - decoder_merged_session: any; - generation_config: any; - add_encoder_pkv: boolean; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: any; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: any; - num_layers: any; - num_heads: any; - dim_kv: any; -} -export class CLIPPreTrainedModel extends PreTrainedModel { + /** + * Creates a new instance of the `VisionEncoderDecoderModel` class. + * @param {Object} config The configuration object specifying the hyperparameters and other model settings. + * @param {Object} session The ONNX session containing the encoder model. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder model. + * @param {Object} generation_config Configuration object for the generation process. + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + decoder_merged_session: any; + generation_config: any; + add_encoder_pkv: boolean; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: any; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: any; + num_layers: any; + num_heads: any; + dim_kv: any; } +export class CLIPPreTrainedModel extends PreTrainedModel {} /** * CLIP Text and Vision Model with a projection layers on top * @@ -1356,8 +1406,7 @@ export class CLIPPreTrainedModel extends PreTrainedModel { * // } * ``` */ -export class CLIPModel extends CLIPPreTrainedModel { -} +export class CLIPModel extends CLIPPreTrainedModel {} /** * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) * @@ -1384,8 +1433,7 @@ export class CLIPModel extends CLIPPreTrainedModel { * // } * ``` */ -export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { -} +export class 
CLIPTextModelWithProjection extends CLIPPreTrainedModel {} /** * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) * @@ -1412,10 +1460,8 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { * // } * ``` */ -export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { -} -export class SiglipPreTrainedModel extends PreTrainedModel { -} +export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {} +export class SiglipPreTrainedModel extends PreTrainedModel {} /** * SigLIP Text and Vision Model with a projection layers on top * @@ -1459,8 +1505,7 @@ export class SiglipPreTrainedModel extends PreTrainedModel { * // } * ``` */ -export class SiglipModel extends SiglipPreTrainedModel { -} +export class SiglipModel extends SiglipPreTrainedModel {} /** * The text model from SigLIP without any head or projection on top. * @@ -1487,8 +1532,7 @@ export class SiglipModel extends SiglipPreTrainedModel { * // } * ``` */ -export class SiglipTextModel extends SiglipPreTrainedModel { -} +export class SiglipTextModel extends SiglipPreTrainedModel {} /** * The vision model from SigLIP without any head or projection on top. 
* @@ -1515,16 +1559,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel { * // } * ``` */ -export class SiglipVisionModel extends CLIPPreTrainedModel { -} -export class ChineseCLIPPreTrainedModel extends PreTrainedModel { -} -export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel { -} -export class CLIPSegPreTrainedModel extends PreTrainedModel { -} -export class CLIPSegModel extends CLIPSegPreTrainedModel { -} +export class SiglipVisionModel extends CLIPPreTrainedModel {} +export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} +export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} +export class CLIPSegPreTrainedModel extends PreTrainedModel {} +export class CLIPSegModel extends CLIPSegPreTrainedModel {} /** * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. * @@ -1571,246 +1610,308 @@ export class CLIPSegModel extends CLIPSegPreTrainedModel { * } * ``` */ -export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel { -} +export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} export class GPT2PreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPT2PreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class GPT2Model extends GPT2PreTrainedModel { + /** + * Creates a new instance of the `GPT2PreTrainedModel` class. 
+ * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class GPT2Model extends GPT2PreTrainedModel {} /** * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. */ -export class GPT2LMHeadModel extends GPT2PreTrainedModel { -} +export class GPT2LMHeadModel extends GPT2PreTrainedModel {} export class GPTNeoPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTNeoPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class GPTNeoModel extends GPTNeoPreTrainedModel { -} -export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel { + /** + * Creates a new instance of the `GPTNeoPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. 
+ * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class GPTNeoModel extends GPTNeoPreTrainedModel {} +export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} export class GPTNeoXPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTNeoXPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class GPTNeoXModel extends GPTNeoXPreTrainedModel { -} -export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel { + /** + * Creates a new instance of the `GPTNeoXPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} +export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} export class GPTJPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTJPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class GPTJModel extends GPTJPreTrainedModel { -} -export class GPTJForCausalLM extends GPTJPreTrainedModel { + /** + * Creates a new instance of the `GPTJPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class GPTJModel extends GPTJPreTrainedModel {} +export class GPTJForCausalLM extends GPTJPreTrainedModel {} export class GPTBigCodePreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `GPTBigCodePreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class GPTBigCodeModel extends GPTBigCodePreTrainedModel { -} -export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel { + /** + * Creates a new instance of the `GPTBigCodePreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} +export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} export class CodeGenPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `CodeGenPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `CodeGenPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * CodeGenModel is a class representing a code generation model without a language model head. */ -export class CodeGenModel extends CodeGenPreTrainedModel { -} +export class CodeGenModel extends CodeGenPreTrainedModel {} /** * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class. */ -export class CodeGenForCausalLM extends CodeGenPreTrainedModel { -} +export class CodeGenForCausalLM extends CodeGenPreTrainedModel {} /** * The bare LLama Model outputting raw hidden-states without any specific head on top. */ export class LlamaPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `LlamaPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `LlamaPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. 
+ * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * The bare LLaMA Model outputting raw hidden-states without any specific head on top. */ -export class LlamaModel extends LlamaPreTrainedModel { -} -export class LlamaForCausalLM extends LlamaPreTrainedModel { -} +export class LlamaModel extends LlamaPreTrainedModel {} +export class LlamaForCausalLM extends LlamaPreTrainedModel {} export class PhiPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `PhiPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `PhiPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * The bare Phi Model outputting raw hidden-states without any specific head on top. */ -export class PhiModel extends PhiPreTrainedModel { -} -export class PhiForCausalLM extends PhiPreTrainedModel { -} +export class PhiModel extends PhiPreTrainedModel {} +export class PhiForCausalLM extends PhiPreTrainedModel {} /** * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ export class BloomPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `BloomPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `BloomPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. */ -export class BloomModel extends BloomPreTrainedModel { -} +export class BloomModel extends BloomPreTrainedModel {} /** * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ -export class BloomForCausalLM extends BloomPreTrainedModel { -} +export class BloomForCausalLM extends BloomPreTrainedModel {} export class MptPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `MptPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `MptPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. */ -export class MptModel extends MptPreTrainedModel { -} +export class MptModel extends MptPreTrainedModel {} /** * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ -export class MptForCausalLM extends MptPreTrainedModel { -} +export class MptForCausalLM extends MptPreTrainedModel {} export class OPTPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `OPTPreTrainedModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; + /** + * Creates a new instance of the `OPTPreTrainedModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } /** * The bare OPT Model outputting raw hidden-states without any specific head on top. */ -export class OPTModel extends OPTPreTrainedModel { -} +export class OPTModel extends OPTPreTrainedModel {} /** * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). */ -export class OPTForCausalLM extends OPTPreTrainedModel { -} -export class ViTPreTrainedModel extends PreTrainedModel { -} -export class ViTModel extends ViTPreTrainedModel { -} +export class OPTForCausalLM extends OPTPreTrainedModel {} +export class ViTPreTrainedModel extends PreTrainedModel {} +export class ViTModel extends ViTPreTrainedModel {} export class ViTForImageClassification extends ViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class VitMattePreTrainedModel extends PreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class VitMattePreTrainedModel extends PreTrainedModel {} /** * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. 
* @@ -1867,151 +1968,132 @@ export class VitMattePreTrainedModel extends PreTrainedModel { * ``` */ export class VitMatteForImageMatting extends VitMattePreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class MobileViTPreTrainedModel extends PreTrainedModel { -} -export class MobileViTModel extends MobileViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class MobileViTPreTrainedModel extends PreTrainedModel {} +export class MobileViTModel extends MobileViTPreTrainedModel {} export class MobileViTForImageClassification extends MobileViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class OwlViTPreTrainedModel extends PreTrainedModel { -} -export class OwlViTModel extends OwlViTPreTrainedModel { -} -export class OwlViTForObjectDetection extends OwlViTPreTrainedModel { -} -export class BeitPreTrainedModel extends PreTrainedModel { -} -export class BeitModel extends BeitPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class OwlViTPreTrainedModel extends PreTrainedModel {} +export class OwlViTModel extends OwlViTPreTrainedModel {} +export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} +export class BeitPreTrainedModel extends PreTrainedModel {} +export class BeitModel extends BeitPreTrainedModel {} export class BeitForImageClassification extends BeitPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class DetrPreTrainedModel extends PreTrainedModel { -} -export class DetrModel extends DetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class DetrPreTrainedModel extends PreTrainedModel {} +export class DetrModel extends DetrPreTrainedModel {} export class DetrForObjectDetection extends DetrPreTrainedModel { - /** 
- * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } export class DetrForSegmentation extends DetrPreTrainedModel { - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - _call(model_inputs: any): Promise; + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + _call(model_inputs: any): Promise; } export class DetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }: { - logits: Tensor; - pred_boxes: Tensor; - }); - logits: Tensor; - pred_boxes: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }: { logits: Tensor; pred_boxes: Tensor }); + logits: Tensor; + pred_boxes: Tensor; } export class DetrSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. 
- * @param {Tensor} output.pred_boxes Predicted boxes. - * @param {Tensor} output.pred_masks Predicted masks. - */ - constructor({ logits, pred_boxes, pred_masks }: { - logits: Tensor; - pred_boxes: Tensor; - pred_masks: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.pred_boxes Predicted boxes. + * @param {Tensor} output.pred_masks Predicted masks. + */ + constructor({ + logits, + pred_boxes, + pred_masks, + }: { logits: Tensor; pred_boxes: Tensor; pred_masks: Tensor; + }); + logits: Tensor; + pred_boxes: Tensor; + pred_masks: Tensor; } -export class TableTransformerPreTrainedModel extends PreTrainedModel { -} +export class TableTransformerPreTrainedModel extends PreTrainedModel {} /** * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) * outputting raw hidden-states without any specific head on top. */ -export class TableTransformerModel extends TableTransformerPreTrainedModel { -} +export class TableTransformerModel extends TableTransformerPreTrainedModel {} /** * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) * with object detection heads on top, for tasks such as COCO detection. 
*/ export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput { -} -export class DeiTPreTrainedModel extends PreTrainedModel { -} -export class DeiTModel extends DeiTPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} +export class DeiTPreTrainedModel extends PreTrainedModel {} +export class DeiTModel extends DeiTPreTrainedModel {} export class DeiTForImageClassification extends DeiTPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class ResNetPreTrainedModel extends PreTrainedModel { -} +export class ResNetPreTrainedModel extends PreTrainedModel {} /** * The bare ResNet model outputting raw features without any specific head on top. */ -export class ResNetModel extends ResNetPreTrainedModel { -} +export class ResNetModel extends ResNetPreTrainedModel {} /** * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. 
*/ export class ResNetForImageClassification extends ResNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class SwinPreTrainedModel extends PreTrainedModel { -} -export class SwinModel extends SwinPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class SwinPreTrainedModel extends PreTrainedModel {} +export class SwinModel extends SwinPreTrainedModel {} export class SwinForImageClassification extends SwinPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class Swin2SRPreTrainedModel extends PreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class Swin2SRPreTrainedModel extends PreTrainedModel {} /** * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. */ -export class Swin2SRModel extends Swin2SRPreTrainedModel { -} +export class Swin2SRModel extends Swin2SRPreTrainedModel {} /** * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. * @@ -2044,15 +2126,12 @@ export class Swin2SRModel extends Swin2SRPreTrainedModel { * // } * ``` */ -export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel { -} -export class DPTPreTrainedModel extends PreTrainedModel { -} +export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} +export class DPTPreTrainedModel extends PreTrainedModel {} /** * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. */ -export class DPTModel extends DPTPreTrainedModel { -} +export class DPTModel extends DPTPreTrainedModel {} /** * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
* @@ -2089,15 +2168,12 @@ export class DPTModel extends DPTPreTrainedModel { * // } * ``` */ -export class DPTForDepthEstimation extends DPTPreTrainedModel { -} -export class GLPNPreTrainedModel extends PreTrainedModel { -} +export class DPTForDepthEstimation extends DPTPreTrainedModel {} +export class GLPNPreTrainedModel extends PreTrainedModel {} /** * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. */ -export class GLPNModel extends GLPNPreTrainedModel { -} +export class GLPNModel extends GLPNPreTrainedModel {} /** * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. * @@ -2134,10 +2210,8 @@ export class GLPNModel extends GLPNPreTrainedModel { * // } * ``` */ -export class GLPNForDepthEstimation extends GLPNPreTrainedModel { -} -export class DonutSwinPreTrainedModel extends PreTrainedModel { -} +export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} +export class DonutSwinPreTrainedModel extends PreTrainedModel {} /** * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. * @@ -2212,82 +2286,69 @@ export class DonutSwinPreTrainedModel extends PreTrainedModel { * // What is the invoice number? us-001 * ``` */ -export class DonutSwinModel extends DonutSwinPreTrainedModel { -} -export class ConvNextPreTrainedModel extends PreTrainedModel { -} +export class DonutSwinModel extends DonutSwinPreTrainedModel {} +export class ConvNextPreTrainedModel extends PreTrainedModel {} /** * The bare ConvNext model outputting raw features without any specific head on top. */ -export class ConvNextModel extends ConvNextPreTrainedModel { -} +export class ConvNextModel extends ConvNextPreTrainedModel {} /** * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. 
*/ export class ConvNextForImageClassification extends ConvNextPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class ConvNextV2PreTrainedModel extends PreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class ConvNextV2PreTrainedModel extends PreTrainedModel {} /** * The bare ConvNextV2 model outputting raw features without any specific head on top. */ -export class ConvNextV2Model extends ConvNextV2PreTrainedModel { -} +export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} /** * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. */ export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class Dinov2PreTrainedModel extends PreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class Dinov2PreTrainedModel extends PreTrainedModel {} /** * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. */ -export class Dinov2Model extends Dinov2PreTrainedModel { -} +export class Dinov2Model extends Dinov2PreTrainedModel {} /** * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
*/ export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; -} -export class YolosPreTrainedModel extends PreTrainedModel { -} -export class YolosModel extends YolosPreTrainedModel { + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } +export class YolosPreTrainedModel extends PreTrainedModel {} +export class YolosModel extends YolosPreTrainedModel {} export class YolosForObjectDetection extends YolosPreTrainedModel { - /** - * @param {any} model_inputs - */ - _call(model_inputs: any): Promise; + /** + * @param {any} model_inputs + */ + _call(model_inputs: any): Promise; } export class YolosObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }: { - logits: Tensor; - pred_boxes: Tensor; - }); - logits: Tensor; - pred_boxes: Tensor; -} -export class SamPreTrainedModel extends PreTrainedModel { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). 
+ */ + constructor({ logits, pred_boxes }: { logits: Tensor; pred_boxes: Tensor }); + logits: Tensor; + pred_boxes: Tensor; } +export class SamPreTrainedModel extends PreTrainedModel {} /** * Segment Anything Model (SAM) for generating segmentation masks, given an input image * and optional 2D location and bounding boxes. @@ -2329,143 +2390,153 @@ export class SamPreTrainedModel extends PreTrainedModel { * ``` */ export class SamModel extends SamPreTrainedModel { + /** + * Creates a new instance of the `SamModel` class. + * @param {Object} config The configuration object specifying the hyperparameters and other model settings. + * @param {Object} vision_encoder The ONNX session containing the vision encoder model. + * @param {any} prompt_encoder_mask_decoder The ONNX session containing the prompt encoder and mask decoder model. + */ + constructor( + config: any, + vision_encoder: any, + prompt_encoder_mask_decoder: any, + ); + prompt_encoder_mask_decoder: any; + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. + * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. + */ + get_image_embeddings({ pixel_values }: { pixel_values: Tensor }): Promise<{ + image_embeddings: Tensor; + image_positional_embeddings: Tensor; + }>; + /** + * @typedef {Object} SamModelInputs Object containing the model inputs. + * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. + * These can be obtained using a `SamProcessor`. + * @property {Tensor} input_points Input 2D spatial points with shape `(batch_size, num_points, 2)`. + * This is used by the prompt encoder to encode the prompt. 
+ * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. + * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: + * - `1`: the point is a point that contains the object of interest + * - `0`: the point is a point that does not contain the object of interest + * - `-1`: the point corresponds to the background + * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. + * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. + */ + /** + * @param {SamModelInputs} model_inputs Object containing the model inputs. + * @returns {Promise} The output of the model. + */ + forward(model_inputs: { /** - * Creates a new instance of the `SamModel` class. - * @param {Object} config The configuration object specifying the hyperparameters and other model settings. - * @param {Object} vision_encoder The ONNX session containing the vision encoder model. - * @param {any} prompt_encoder_mask_decoder The ONNX session containing the prompt encoder and mask decoder model. - */ - constructor(config: any, vision_encoder: any, prompt_encoder_mask_decoder: any); - prompt_encoder_mask_decoder: any; - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. - * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. 
- */ - get_image_embeddings({ pixel_values }: { - pixel_values: Tensor; - }): Promise<{ - image_embeddings: Tensor; - image_positional_embeddings: Tensor; - }>; - /** - * @typedef {Object} SamModelInputs Object containing the model inputs. - * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. + * Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. * These can be obtained using a `SamProcessor`. - * @property {Tensor} input_points Input 2D spatial points with shape `(batch_size, num_points, 2)`. + */ + pixel_values: Tensor; + /** + * Input 2D spatial points with shape `(batch_size, num_points, 2)`. * This is used by the prompt encoder to encode the prompt. - * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. + */ + input_points: Tensor; + /** + * Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: - * - `1`: the point is a point that contains the object of interest - * - `0`: the point is a point that does not contain the object of interest - * - `-1`: the point corresponds to the background - * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder - * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. - * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. + * - `1`: the point is a point that contains the object of interest + * - `0`: the point is a point that does not contain the object of interest + * - `-1`: the point corresponds to the background + * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder */ + input_labels?: Tensor; /** - * @param {SamModelInputs} model_inputs Object containing the model inputs. 
- * @returns {Promise} The output of the model. + * Image embeddings used by the mask decoder. */ - forward(model_inputs: { - /** - * Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. - * These can be obtained using a `SamProcessor`. - */ - pixel_values: Tensor; - /** - * Input 2D spatial points with shape `(batch_size, num_points, 2)`. - * This is used by the prompt encoder to encode the prompt. - */ - input_points: Tensor; - /** - * Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. - * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: - * - `1`: the point is a point that contains the object of interest - * - `0`: the point is a point that does not contain the object of interest - * - `-1`: the point corresponds to the background - * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder - */ - input_labels?: Tensor; - /** - * Image embeddings used by the mask decoder. - */ - image_embeddings?: Tensor; - /** - * Image positional embeddings used by the mask decoder. - */ - image_positional_embeddings?: Tensor; - }): Promise; + image_embeddings?: Tensor; /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs + * Image positional embeddings used by the mask decoder. */ - _call(model_inputs: any): Promise; + image_positional_embeddings?: Tensor; + }): Promise; + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + _call(model_inputs: any): Promise; } /** * Base class for Segment-Anything model's output. */ export class SamImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. 
- * @param {Tensor} output.pred_masks Predicted boxes. - */ - constructor({ iou_scores, pred_masks }: { - iou_scores: Tensor; - pred_masks: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model. + * @param {Tensor} output.pred_masks Predicted boxes. + */ + constructor({ + iou_scores, + pred_masks, + }: { iou_scores: Tensor; pred_masks: Tensor; + }); + iou_scores: Tensor; + pred_masks: Tensor; } -export class MarianPreTrainedModel extends PreTrainedModel { -} -export class MarianModel extends MarianPreTrainedModel { -} +export class MarianPreTrainedModel extends PreTrainedModel {} +export class MarianModel extends MarianPreTrainedModel {} export class MarianMTModel extends MarianPreTrainedModel { - /** - * Creates a new instance of the `MarianMTModel` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {any} decoder_merged_session - * @param {any} generation_config - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - decoder_merged_session: any; - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; -} -export class M2M100PreTrainedModel extends PreTrainedModel { -} -export class M2M100Model extends M2M100PreTrainedModel { + /** + * Creates a new instance of the `MarianMTModel` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. 
+ * @param {any} decoder_merged_session + * @param {any} generation_config + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + decoder_merged_session: any; + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } +export class M2M100PreTrainedModel extends PreTrainedModel {} +export class M2M100Model extends M2M100PreTrainedModel {} export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel { - /** - * Creates a new instance of the `M2M100ForConditionalGeneration` class. - * @param {Object} config The model configuration object. - * @param {Object} session The ONNX session object. - * @param {any} decoder_merged_session - * @param {any} generation_config - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: any); - decoder_merged_session: any; - generation_config: any; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; -} -export class Wav2Vec2PreTrainedModel extends PreTrainedModel { + /** + * Creates a new instance of the `M2M100ForConditionalGeneration` class. + * @param {Object} config The model configuration object. + * @param {Object} session The ONNX session object. 
+ * @param {any} decoder_merged_session + * @param {any} generation_config + */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: any, + ); + decoder_merged_session: any; + generation_config: any; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; } +export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} /** * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. * @@ -2492,29 +2563,27 @@ export class Wav2Vec2PreTrainedModel extends PreTrainedModel { * // } * ``` */ -export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel { -} +export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - _call(model_inputs: { - input_values: Tensor; - attention_mask: Tensor; - }): Promise; + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + _call(model_inputs: { + input_values: Tensor; + attention_mask: Tensor; + }): Promise; } export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
- */ - _call(model_inputs: any): Promise; -} -export class HubertPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } +export class HubertPreTrainedModel extends PreTrainedModel {} /** * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. * @@ -2541,38 +2610,36 @@ export class HubertPreTrainedModel extends PreTrainedModel { * // } * ``` */ -export class HubertModel extends Wav2Vec2PreTrainedModel { -} +export class HubertModel extends Wav2Vec2PreTrainedModel {} /** * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). */ export class HubertForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - _call(model_inputs: { - input_values: Tensor; - attention_mask: Tensor; - }): Promise; + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + _call(model_inputs: { + input_values: Tensor; + attention_mask: Tensor; + }): Promise; } /** * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. */ export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class WavLMPreTrainedModel extends PreTrainedModel { -} +export class WavLMPreTrainedModel extends PreTrainedModel {} /** * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. * @@ -2599,43 +2666,40 @@ export class WavLMPreTrainedModel extends PreTrainedModel { * // } * ``` */ -export class WavLMModel extends WavLMPreTrainedModel { -} +export class WavLMModel extends WavLMPreTrainedModel {} /** * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). */ export class WavLMForCTC extends WavLMPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - _call(model_inputs: { - input_values: Tensor; - attention_mask: Tensor; - }): Promise; + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + _call(model_inputs: { + input_values: Tensor; + attention_mask: Tensor; + }): Promise; } /** * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). 
*/ export class WavLMForSequenceClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - _call(model_inputs: any): Promise; + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + _call(model_inputs: any): Promise; } /** * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. */ -export class SpeechT5PreTrainedModel extends PreTrainedModel { -} +export class SpeechT5PreTrainedModel extends PreTrainedModel {} /** * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. */ -export class SpeechT5Model extends SpeechT5PreTrainedModel { -} +export class SpeechT5Model extends SpeechT5PreTrainedModel {} /** * SpeechT5 Model with a speech encoder and a text decoder. * @@ -2676,143 +2740,176 @@ export class SpeechT5Model extends SpeechT5PreTrainedModel { * // } * ``` */ -export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel { -} +export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} /** * SpeechT5 Model with a text encoder and a speech decoder. */ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { + /** + * Creates a new instance of the `SpeechT5ForTextToSpeech` class. + * @param {Object} config The model configuration. + * @param {any} session session for the model. + * @param {any} decoder_merged_session session for the decoder. + * @param {GenerationConfig} generation_config The generation configuration. 
+ */ + constructor( + config: any, + session: any, + decoder_merged_session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + decoder_merged_session: any; + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_decoder_layers: any; + num_decoder_heads: any; + decoder_dim_kv: number; + num_encoder_layers: any; + num_encoder_heads: any; + encoder_dim_kv: number; + /** + * @typedef {Object} SpeechOutput + * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape + * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided + * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. + * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape + * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. + */ + /** + * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. + * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. + * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. + * @param {Object} options Optional parameters for generating speech. + * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. + * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. + * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. 
+ * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. + * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. + * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. + */ + generate_speech( + input_values: Tensor, + speaker_embeddings: Tensor, + { + threshold, + minlenratio, + maxlenratio, + vocoder, + }?: { + threshold?: number; + minlenratio?: number; + maxlenratio?: number; + vocoder?: any; + output_cross_attentions?: boolean; + }, + ): Promise<{ /** - * Creates a new instance of the `SpeechT5ForTextToSpeech` class. - * @param {Object} config The model configuration. - * @param {any} session session for the model. - * @param {any} decoder_merged_session session for the decoder. - * @param {GenerationConfig} generation_config The generation configuration. - */ - constructor(config: any, session: any, decoder_merged_session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - decoder_merged_session: any; - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_decoder_layers: any; - num_decoder_heads: any; - decoder_dim_kv: number; - num_encoder_layers: any; - num_encoder_heads: any; - encoder_dim_kv: number; - /** - * @typedef {Object} SpeechOutput - * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape + * The predicted log-mel spectrogram of shape * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided - * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. 
- * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape + */ + spectrogram?: Tensor; + /** + * The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. + */ + waveform?: Tensor; + /** + * The outputs of the decoder's cross-attention layers of shape * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. */ - /** - * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. - * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. - * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. - * @param {Object} options Optional parameters for generating speech. - * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. - * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. - * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. - * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. - * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. - * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. 
- */ - generate_speech(input_values: Tensor, speaker_embeddings: Tensor, { threshold, minlenratio, maxlenratio, vocoder, }?: { - threshold?: number; - minlenratio?: number; - maxlenratio?: number; - vocoder?: any; - output_cross_attentions?: boolean; - }): Promise<{ - /** - * The predicted log-mel spectrogram of shape - * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided - */ - spectrogram?: Tensor; - /** - * The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. - */ - waveform?: Tensor; - /** - * The outputs of the decoder's cross-attention layers of shape - * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. - */ - cross_attentions?: Tensor; - }>; + cross_attentions?: Tensor; + }>; } /** * HiFi-GAN vocoder. * * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. */ -export class SpeechT5HifiGan extends PreTrainedModel { -} +export class SpeechT5HifiGan extends PreTrainedModel {} export class TrOCRPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `TrOCRPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_encoder_layers: any; - num_decoder_layers: any; - num_encoder_heads: any; - num_decoder_heads: any; - encoder_dim_kv: number; - decoder_dim_kv: number; + /** + * Creates a new instance of the `TrOCRPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_encoder_layers: any; + num_decoder_layers: any; + num_encoder_heads: any; + num_decoder_heads: any; + encoder_dim_kv: number; + decoder_dim_kv: number; } /** * The TrOCR Decoder with a language modeling head. */ -export class TrOCRForCausalLM extends TrOCRPreTrainedModel { -} +export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} /** * The bare Mistral Model outputting raw hidden-states without any specific head on top. */ export class MistralPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `MistralPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class MistralModel extends MistralPreTrainedModel { -} -export class MistralForCausalLM extends MistralPreTrainedModel { + /** + * Creates a new instance of the `MistralPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class MistralModel extends MistralPreTrainedModel {} +export class MistralForCausalLM extends MistralPreTrainedModel {} /** * The bare Falcon Model outputting raw hidden-states without any specific head on top. */ export class FalconPreTrainedModel extends PreTrainedModel { - /** - * Creates a new instance of the `FalconPreTrainedModel` class. - * @param {Object} config The configuration of the model. - * @param {any} session The ONNX session containing the model weights. - * @param {GenerationConfig} generation_config The generation configuration. 
- */ - constructor(config: any, session: any, generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType); - generation_config: new (kwargs?: import("./utils/generation.js").GenerationConfigType) => import("./utils/generation.js").GenerationConfigType; - num_heads: any; - num_layers: any; - dim_kv: number; -} -export class FalconModel extends FalconPreTrainedModel { -} -export class FalconForCausalLM extends FalconPreTrainedModel { -} -export class ClapPreTrainedModel extends PreTrainedModel { -} -export class ClapModel extends ClapPreTrainedModel { + /** + * Creates a new instance of the `FalconPreTrainedModel` class. + * @param {Object} config The configuration of the model. + * @param {any} session The ONNX session containing the model weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor( + config: any, + session: any, + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType, + ); + generation_config: new ( + kwargs?: import("./utils/generation.js").GenerationConfigType, + ) => import("./utils/generation.js").GenerationConfigType; + num_heads: any; + num_layers: any; + dim_kv: number; } +export class FalconModel extends FalconPreTrainedModel {} +export class FalconForCausalLM extends FalconPreTrainedModel {} +export class ClapPreTrainedModel extends PreTrainedModel {} +export class ClapModel extends ClapPreTrainedModel {} /** * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). 
* @@ -2839,8 +2936,7 @@ export class ClapModel extends ClapPreTrainedModel { * // } * ``` */ -export class ClapTextModelWithProjection extends ClapPreTrainedModel { -} +export class ClapTextModelWithProjection extends ClapPreTrainedModel {} /** * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). * @@ -2867,10 +2963,8 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel { * // } * ``` */ -export class ClapAudioModelWithProjection extends ClapPreTrainedModel { -} -export class VitsPreTrainedModel extends PreTrainedModel { -} +export class ClapAudioModelWithProjection extends ClapPreTrainedModel {} +export class VitsPreTrainedModel extends PreTrainedModel {} /** * The complete VITS model, for text-to-speech synthesis. * @@ -2896,61 +2990,68 @@ export class VitsPreTrainedModel extends PreTrainedModel { * ``` */ export class VitsModel extends VitsPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} The outputs for the VITS model. - */ - _call(model_inputs: any): Promise; -} -export class SegformerPreTrainedModel extends PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} The outputs for the VITS model. + */ + _call(model_inputs: any): Promise; } +export class SegformerPreTrainedModel extends PreTrainedModel {} /** * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. */ -export class SegformerModel extends SegformerPreTrainedModel { -} +export class SegformerModel extends SegformerPreTrainedModel {} /** * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. 
*/ -export class SegformerForImageClassification extends SegformerPreTrainedModel { -} +export class SegformerForImageClassification extends SegformerPreTrainedModel {} /** * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes. */ -export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel { -} +export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} /** * Base class of all AutoModels. Contains the `from_pretrained` function * which is used to instantiate pretrained models. */ export class PretrainedMixin { - /** - * Mapping from model type to model class. - * @type {Map[]} - */ - static MODEL_CLASS_MAPPINGS: Map[]; - /** - * Whether to attempt to instantiate the base class (`PretrainedModel`) if - * the model type is not found in the mapping. - */ - static BASE_IF_FAIL: boolean; - /** - * Instantiate one of the model classes of the library from a pretrained model. - * - * The model class to instantiate is selected based on the `model_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. - * - * @returns {Promise} A new instance of the `PreTrainedModel` class. 
- */ - static from_pretrained(pretrained_model_name_or_path: string, { quantized, progress_callback, config, cache_dir, local_files_only, revision, model_file_name, }?: import("./utils/hub.js").PretrainedOptions): Promise; + /** + * Mapping from model type to model class. + * @type {Map[]} + */ + static MODEL_CLASS_MAPPINGS: Map[]; + /** + * Whether to attempt to instantiate the base class (`PretrainedModel`) if + * the model type is not found in the mapping. + */ + static BASE_IF_FAIL: boolean; + /** + * Instantiate one of the model classes of the library from a pretrained model. + * + * The model class to instantiate is selected based on the `model_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. + * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the model. + * + * @returns {Promise} A new instance of the `PreTrainedModel` class. + */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + }?: import("./utils/hub.js").PretrainedOptions, + ): Promise; } /** * Helper class which is used to instantiate pretrained models with the `from_pretrained` function. 
@@ -2959,8 +3060,7 @@ export class PretrainedMixin { * @example * let model = await AutoModel.from_pretrained('bert-base-uncased'); */ -export class AutoModel extends PretrainedMixin { -} +export class AutoModel extends PretrainedMixin {} /** * Helper class which is used to instantiate pretrained sequence classification models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. @@ -2969,7 +3069,10 @@ export class AutoModel extends PretrainedMixin { * let model = await AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english'); */ export class AutoModelForSequenceClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof BertForSequenceClassification)[] + >[]; } /** * Helper class which is used to instantiate pretrained token classification models with the `from_pretrained` function. @@ -2979,7 +3082,10 @@ export class AutoModelForSequenceClassification extends PretrainedMixin { * let model = await AutoModelForTokenClassification.from_pretrained('Davlan/distilbert-base-multilingual-cased-ner-hrl'); */ export class AutoModelForTokenClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof BertForTokenClassification)[] + >[]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence models with the `from_pretrained` function. 
@@ -2989,7 +3095,11 @@ export class AutoModelForTokenClassification extends PretrainedMixin { * let model = await AutoModelForSeq2SeqLM.from_pretrained('t5-small'); */ export class AutoModelForSeq2SeqLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + | (string | typeof T5ForConditionalGeneration)[] + | (string | typeof BartForConditionalGeneration)[] + >[]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence speech-to-text models with the `from_pretrained` function. @@ -2999,7 +3109,11 @@ export class AutoModelForSeq2SeqLM extends PretrainedMixin { * let model = await AutoModelForSpeechSeq2Seq.from_pretrained('openai/whisper-tiny.en'); */ export class AutoModelForSpeechSeq2Seq extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + | (string | typeof SpeechT5ForSpeechToText)[] + | (string | typeof WhisperForConditionalGeneration)[] + >[]; } /** * Helper class which is used to instantiate pretrained sequence-to-sequence text-to-spectrogram models with the `from_pretrained` function. @@ -3009,7 +3123,10 @@ export class AutoModelForSpeechSeq2Seq extends PretrainedMixin { * let model = await AutoModelForTextToSpectrogram.from_pretrained('microsoft/speecht5_tts'); */ export class AutoModelForTextToSpectrogram extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof SpeechT5ForTextToSpeech)[] + >[]; } /** * Helper class which is used to instantiate pretrained text-to-waveform models with the `from_pretrained` function. 
@@ -3019,7 +3136,7 @@ export class AutoModelForTextToSpectrogram extends PretrainedMixin { * let model = await AutoModelForTextToSpectrogram.from_pretrained('facebook/mms-tts-eng'); */ export class AutoModelForTextToWaveform extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map[]; } /** * Helper class which is used to instantiate pretrained causal language models with the `from_pretrained` function. @@ -3029,7 +3146,10 @@ export class AutoModelForTextToWaveform extends PretrainedMixin { * let model = await AutoModelForCausalLM.from_pretrained('gpt2'); */ export class AutoModelForCausalLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof BloomForCausalLM)[] | (string | typeof MBartForCausalLM)[] + >[]; } /** * Helper class which is used to instantiate pretrained masked language models with the `from_pretrained` function. @@ -3039,7 +3159,10 @@ export class AutoModelForCausalLM extends PretrainedMixin { * let model = await AutoModelForMaskedLM.from_pretrained('bert-base-uncased'); */ export class AutoModelForMaskedLM extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof BertForMaskedLM)[] + >[]; } /** * Helper class which is used to instantiate pretrained question answering models with the `from_pretrained` function. @@ -3049,7 +3172,10 @@ export class AutoModelForMaskedLM extends PretrainedMixin { * let model = await AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad'); */ export class AutoModelForQuestionAnswering extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof BertForQuestionAnswering)[] + >[]; } /** * Helper class which is used to instantiate pretrained vision-to-sequence models with the `from_pretrained` function. 
@@ -3059,7 +3185,10 @@ export class AutoModelForQuestionAnswering extends PretrainedMixin { * let model = await AutoModelForVision2Seq.from_pretrained('nlpconnect/vit-gpt2-image-captioning'); */ export class AutoModelForVision2Seq extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof VisionEncoderDecoderModel)[] + >[]; } /** * Helper class which is used to instantiate pretrained image classification models with the `from_pretrained` function. @@ -3069,7 +3198,10 @@ export class AutoModelForVision2Seq extends PretrainedMixin { * let model = await AutoModelForImageClassification.from_pretrained('google/vit-base-patch16-224'); */ export class AutoModelForImageClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof SegformerForImageClassification)[] + >[]; } /** * Helper class which is used to instantiate pretrained image segmentation models with the `from_pretrained` function. @@ -3079,7 +3211,10 @@ export class AutoModelForImageClassification extends PretrainedMixin { * let model = await AutoModelForImageSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic'); */ export class AutoModelForImageSegmentation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof CLIPSegForImageSegmentation)[] + >[]; } /** * Helper class which is used to instantiate pretrained image segmentation models with the `from_pretrained` function. 
@@ -3089,7 +3224,10 @@ export class AutoModelForImageSegmentation extends PretrainedMixin { * let model = await AutoModelForSemanticSegmentation.from_pretrained('nvidia/segformer-b3-finetuned-cityscapes-1024-1024'); */ export class AutoModelForSemanticSegmentation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof SegformerForSemanticSegmentation)[] + >[]; } /** * Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function. @@ -3099,10 +3237,16 @@ export class AutoModelForSemanticSegmentation extends PretrainedMixin { * let model = await AutoModelForObjectDetection.from_pretrained('facebook/detr-resnet-50'); */ export class AutoModelForObjectDetection extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof DetrForObjectDetection)[] + >[]; } export class AutoModelForZeroShotObjectDetection extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof OwlViTForObjectDetection)[] + >[]; } /** * Helper class which is used to instantiate pretrained mask generation models with the `from_pretrained` function. 
@@ -3112,161 +3256,184 @@ export class AutoModelForZeroShotObjectDetection extends PretrainedMixin { * let model = await AutoModelForMaskGeneration.from_pretrained('Xenova/sam-vit-base'); */ export class AutoModelForMaskGeneration extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map[]; } export class AutoModelForCTC extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof Wav2Vec2ForCTC)[] + >[]; } export class AutoModelForAudioClassification extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof ASTForAudioClassification)[] + >[]; } export class AutoModelForDocumentQuestionAnswering extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof VisionEncoderDecoderModel)[] + >[]; } export class AutoModelForImageMatting extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof VitMatteForImageMatting)[] + >[]; } export class AutoModelForImageToImage extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof Swin2SRForImageSuperResolution)[] + >[]; } export class AutoModelForDepthEstimation extends PretrainedMixin { - static MODEL_CLASS_MAPPINGS: Map[]; + static MODEL_CLASS_MAPPINGS: Map< + string, + (string | typeof DPTForDepthEstimation)[] + >[]; } export class Seq2SeqLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. - * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. 
- * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - */ - constructor({ logits, past_key_values, encoder_outputs, decoder_attentions, cross_attentions }: { - logits: Tensor; - past_key_values: Tensor; - encoder_outputs: Tensor; - decoder_attentions?: Tensor; - cross_attentions?: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. + * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. + * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. + * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + */ + constructor({ + logits, + past_key_values, + encoder_outputs, + decoder_attentions, + cross_attentions, + }: { logits: Tensor; past_key_values: Tensor; encoder_outputs: Tensor; - decoder_attentions: Tensor; - cross_attentions: Tensor; + decoder_attentions?: Tensor; + cross_attentions?: Tensor; + }); + logits: Tensor; + past_key_values: Tensor; + encoder_outputs: Tensor; + decoder_attentions: Tensor; + cross_attentions: Tensor; } /** * Base class for outputs of sentence classification models. */ export class SequenceClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. 
- * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). - */ - constructor({ logits }: { - logits: Tensor; - }); - logits: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). + */ + constructor({ logits }: { logits: Tensor }); + logits: Tensor; } /** * Base class for outputs of token classification models. */ export class TokenClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification scores (before SoftMax). - */ - constructor({ logits }: { - logits: Tensor; - }); - logits: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification scores (before SoftMax). + */ + constructor({ logits }: { logits: Tensor }); + logits: Tensor; } /** * Base class for masked language models outputs. */ export class MaskedLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - */ - constructor({ logits }: { - logits: Tensor; - }); - logits: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + */ + constructor({ logits }: { logits: Tensor }); + logits: Tensor; } /** * Base class for outputs of question answering models. */ export class QuestionAnsweringModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.start_logits Span-start scores (before SoftMax). - * @param {Tensor} output.end_logits Span-end scores (before SoftMax). 
- */ - constructor({ start_logits, end_logits }: { - start_logits: Tensor; - end_logits: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.start_logits Span-start scores (before SoftMax). + * @param {Tensor} output.end_logits Span-end scores (before SoftMax). + */ + constructor({ + start_logits, + end_logits, + }: { start_logits: Tensor; end_logits: Tensor; + }); + start_logits: Tensor; + end_logits: Tensor; } /** * Base class for causal language model (or autoregressive) outputs. */ export class CausalLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - */ - constructor({ logits }: { - logits: Tensor; - }); - logits: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + */ + constructor({ logits }: { logits: Tensor }); + logits: Tensor; } /** * Base class for causal language model (or autoregressive) outputs. */ export class CausalLMOutputWithPast extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) - * that can be used (see `past_key_values` input) to speed up sequential decoding. - */ - constructor({ logits, past_key_values }: { - logits: Tensor; - past_key_values: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). 
+ * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) + * that can be used (see `past_key_values` input) to speed up sequential decoding. + */ + constructor({ + logits, + past_key_values, + }: { logits: Tensor; past_key_values: Tensor; + }); + logits: Tensor; + past_key_values: Tensor; } export class ImageMattingOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. - */ - constructor({ alphas }: { - alphas: Tensor; - }); - alphas: Tensor; + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. + */ + constructor({ alphas }: { alphas: Tensor }); + alphas: Tensor; } /** * Describes the outputs for the VITS model. */ export class VitsModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. - * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. - * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. - */ - constructor({ waveform, spectrogram }: { - waveform: Tensor; - spectrogram: Tensor; - }); + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. + * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. + * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. 
+ */ + constructor({ + waveform, + spectrogram, + }: { waveform: Tensor; spectrogram: Tensor; + }); + waveform: Tensor; + spectrogram: Tensor; } -export type InferenceSession = import('onnxruntime-web').InferenceSession; +export type InferenceSession = import("onnxruntime-web").InferenceSession; /** * Runs a single step of the text generation process for a given beam. * @@ -3280,13 +3447,16 @@ export type InferenceSession = import('onnxruntime-web').InferenceSession; * @returns {Promise} The output of the generation step. * @private */ -declare function decoderRunBeam(self: any, beam: { +declare function decoderRunBeam( + self: any, + beam: { input: Tensor; model_input_ids: Tensor; attention_mask: Tensor; prev_model_outputs: any; output_token_ids: number[]; -}): Promise; + }, +): Promise; /** * Starts the generation of text by initializing the beams for the given input token IDs. * @param {Object} self The text generation model object. @@ -3297,7 +3467,13 @@ declare function decoderRunBeam(self: any, beam: { * @returns {Object[]} An array of beams initialized with the given inputs and parameters. * @private */ -declare function decoderStartBeams(self: any, inputTokenIds: Tensor, generation_config: any, numOutputTokens: number, inputs_attention_mask?: Tensor): any[]; +declare function decoderStartBeams( + self: any, + inputTokenIds: Tensor, + generation_config: any, + numOutputTokens: number, + inputs_attention_mask?: Tensor, +): any[]; /** * Update a beam with a new token ID. * @param {Object} beam The beam to update. 
@@ -3313,6 +3489,6 @@ declare function decoderUpdatebeam(beam: any, newTokenId: number): void; * @private */ declare function encoderForward(self: any, model_inputs: any): Promise; -import { Tensor } from './utils/tensor.js'; +import { Tensor } from "./utils/tensor.js"; export {}; -//# sourceMappingURL=models.d.ts.map \ No newline at end of file +//# sourceMappingURL=models.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/pipelines.d.ts b/core/vendor/modules/@xenova/transformers/types/pipelines.d.ts index 840a5dac5..a6714cc87 100644 --- a/core/vendor/modules/@xenova/transformers/types/pipelines.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/pipelines.d.ts @@ -38,10 +38,21 @@ * @returns {Promise} A Pipeline object for the specified task. * @throws {Error} If an unsupported pipeline is requested. */ -export function pipeline(task: T, model?: string, { quantized, progress_callback, config, cache_dir, local_files_only, revision, }?: import('./utils/hub.js').PretrainedOptions): Promise; +export function pipeline( + task: T, + model?: string, + { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }?: import("./utils/hub.js").PretrainedOptions, +): Promise; declare const Pipeline_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * @callback DisposeType Disposes the item. @@ -56,27 +67,34 @@ declare const Pipeline_base: new () => { * @extends Callable */ export class Pipeline extends Pipeline_base { - /** - * Create a new Pipeline. - * @param {Object} options An object containing the following properties: - * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks. - * @param {PreTrainedModel} [options.model] The model used by the pipeline. - * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any). 
- * @param {Processor} [options.processor=null] The processor used by the pipeline (if any). - */ - constructor({ task, model, tokenizer, processor }: { - task?: string; - model?: PreTrainedModel; - tokenizer?: PreTrainedTokenizer; - processor?: Processor; - }); - task: string; - model: PreTrainedModel; - tokenizer: PreTrainedTokenizer; - processor: Processor; - dispose(): Promise; + /** + * Create a new Pipeline. + * @param {Object} options An object containing the following properties: + * @param {string} [options.task] The task of the pipeline. Useful for specifying subtasks. + * @param {PreTrainedModel} [options.model] The model used by the pipeline. + * @param {PreTrainedTokenizer} [options.tokenizer=null] The tokenizer used by the pipeline (if any). + * @param {Processor} [options.processor=null] The processor used by the pipeline (if any). + */ + constructor({ + task, + model, + tokenizer, + processor, + }: { + task?: string; + model?: PreTrainedModel; + tokenizer?: PreTrainedTokenizer; + processor?: Processor; + }); + task: string; + model: PreTrainedModel; + tokenizer: PreTrainedTokenizer; + processor: Processor; + dispose(): Promise; } -declare const TextClassificationPipeline_base: new (options: TextPipelineConstructorArgs) => TextClassificationPipelineType; +declare const TextClassificationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => TextClassificationPipelineType; /** * @typedef {Object} ModelTokenizerConstructorArgs * @property {string} task The task of the pipeline. Useful for specifying subtasks. 
@@ -158,9 +176,14 @@ declare const TextClassificationPipeline_base: new (options: TextPipelineConstru * ``` */ export class TextClassificationPipeline extends TextClassificationPipeline_base { - _call(texts: string | string[], options?: TextClassificationPipelineOptions): Promise; + _call( + texts: string | string[], + options?: TextClassificationPipelineOptions, + ): Promise; } -declare const TokenClassificationPipeline_base: new (options: TextPipelineConstructorArgs) => TokenClassificationPipelineType; +declare const TokenClassificationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => TokenClassificationPipelineType; /** * @typedef {Object} TokenClassificationSingle * @property {string} word The token/word classified. This is obtained by decoding the selected tokens. @@ -211,9 +234,14 @@ declare const TokenClassificationPipeline_base: new (options: TextPipelineConstr * ``` */ export class TokenClassificationPipeline extends TokenClassificationPipeline_base { - _call(texts: string | string[], options?: TokenClassificationPipelineOptions): Promise; + _call( + texts: string | string[], + options?: TokenClassificationPipelineOptions, + ): Promise; } -declare const QuestionAnsweringPipeline_base: new (options: TextPipelineConstructorArgs) => QuestionAnsweringPipelineType; +declare const QuestionAnsweringPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => QuestionAnsweringPipelineType; /** * @typedef {Object} QuestionAnsweringOutput * @property {number} score The probability associated to the answer. 
@@ -248,9 +276,15 @@ declare const QuestionAnsweringPipeline_base: new (options: TextPipelineConstruc * ``` */ export class QuestionAnsweringPipeline extends QuestionAnsweringPipeline_base { - _call(question: string | string[], context: string | string[], options?: QuestionAnsweringPipelineOptions): Promise; + _call( + question: string | string[], + context: string | string[], + options?: QuestionAnsweringPipelineOptions, + ): Promise; } -declare const FillMaskPipeline_base: new (options: TextPipelineConstructorArgs) => FillMaskPipelineType; +declare const FillMaskPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => FillMaskPipelineType; /** * @typedef {Object} FillMaskSingle * @property {string} sequence The corresponding input with the mask token prediction. @@ -296,9 +330,14 @@ declare const FillMaskPipeline_base: new (options: TextPipelineConstructorArgs) * ``` */ export class FillMaskPipeline extends FillMaskPipeline_base { - _call(texts: string | string[], options?: FillMaskPipelineOptions): Promise; + _call( + texts: string | string[], + options?: FillMaskPipelineOptions, + ): Promise; } -declare const Text2TextGenerationPipeline_base: new (options: TextPipelineConstructorArgs) => Text2TextGenerationPipelineType; +declare const Text2TextGenerationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => Text2TextGenerationPipelineType; /** * @typedef {Object} Text2TextGenerationSingle * @property {string} generated_text The generated text. 
@@ -324,11 +363,16 @@ declare const Text2TextGenerationPipeline_base: new (options: TextPipelineConstr * ``` */ export class Text2TextGenerationPipeline extends Text2TextGenerationPipeline_base { - /** @type {'generated_text'} */ - _key: 'generated_text'; - _call(texts: string | string[], options?: import('./utils/generation.js').GenerationConfigType): Promise; + /** @type {'generated_text'} */ + _key: "generated_text"; + _call( + texts: string | string[], + options?: import("./utils/generation.js").GenerationConfigType, + ): Promise; } -declare const SummarizationPipeline_base: new (options: TextPipelineConstructorArgs) => SummarizationPipelineType; +declare const SummarizationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => SummarizationPipelineType; /** * @typedef {Object} SummarizationSingle * @property {string} summary_text The summary text. @@ -362,10 +406,12 @@ declare const SummarizationPipeline_base: new (options: TextPipelineConstructorA * ``` */ export class SummarizationPipeline extends SummarizationPipeline_base { - /** @type {'summary_text'} */ - _key: 'summary_text'; + /** @type {'summary_text'} */ + _key: "summary_text"; } -declare const TranslationPipeline_base: new (options: TextPipelineConstructorArgs) => TranslationPipelineType; +declare const TranslationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => TranslationPipelineType; /** * @typedef {Object} TranslationSingle * @property {string} translation_text The translated text. 
@@ -424,10 +470,12 @@ declare const TranslationPipeline_base: new (options: TextPipelineConstructorArg * ``` */ export class TranslationPipeline extends TranslationPipeline_base { - /** @type {'translation_text'} */ - _key: 'translation_text'; + /** @type {'translation_text'} */ + _key: "translation_text"; } -declare const TextGenerationPipeline_base: new (options: TextPipelineConstructorArgs) => TextGenerationPipelineType; +declare const TextGenerationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => TextGenerationPipelineType; /** * @typedef {Object} TextGenerationSingle * @property {string} generated_text The generated text. @@ -495,9 +543,14 @@ declare const TextGenerationPipeline_base: new (options: TextPipelineConstructor * ``` */ export class TextGenerationPipeline extends TextGenerationPipeline_base { - _call(texts: string | string[], options?: TextGenerationConfig): Promise; + _call( + texts: string | string[], + options?: TextGenerationConfig, + ): Promise; } -declare const ZeroShotClassificationPipeline_base: new (options: TextPipelineConstructorArgs) => ZeroShotClassificationPipelineType; +declare const ZeroShotClassificationPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => ZeroShotClassificationPipelineType; /** * @typedef {Object} ZeroShotClassificationOutput * @property {string} sequence The sequence for which this is the output. 
@@ -554,14 +607,20 @@ declare const ZeroShotClassificationPipeline_base: new (options: TextPipelineCon * ``` */ export class ZeroShotClassificationPipeline extends ZeroShotClassificationPipeline_base { - label2id: { - [k: string]: any; - }; - entailment_id: any; - contradiction_id: any; - _call(texts: string | string[], candidate_labels: string | string[], options?: ZeroShotClassificationPipelineOptions): Promise; + label2id: { + [k: string]: any; + }; + entailment_id: any; + contradiction_id: any; + _call( + texts: string | string[], + candidate_labels: string | string[], + options?: ZeroShotClassificationPipelineOptions, + ): Promise; } -declare const FeatureExtractionPipeline_base: new (options: TextPipelineConstructorArgs) => FeatureExtractionPipelineType; +declare const FeatureExtractionPipeline_base: new ( + options: TextPipelineConstructorArgs, +) => FeatureExtractionPipelineType; /** * @typedef {Object} FeatureExtractionPipelineOptions Parameters specific to feature extraction pipelines. * @property {'none'|'mean'|'cls'} [pooling="none"] The pooling method to use. @@ -612,9 +671,14 @@ declare const FeatureExtractionPipeline_base: new (options: TextPipelineConstruc * ``` */ export class FeatureExtractionPipeline extends FeatureExtractionPipeline_base { - _call(texts: string | string[], options?: FeatureExtractionPipelineOptions): Promise; + _call( + texts: string | string[], + options?: FeatureExtractionPipelineOptions, + ): Promise; } -declare const AudioClassificationPipeline_base: new (options: AudioPipelineConstructorArgs) => AudioClassificationPipelineType; +declare const AudioClassificationPipeline_base: new ( + options: AudioPipelineConstructorArgs, +) => AudioClassificationPipelineType; /** * @typedef {Object} AudioClassificationSingle * @property {string} label The label predicted. 
@@ -666,9 +730,14 @@ declare const AudioClassificationPipeline_base: new (options: AudioPipelineConst * ``` */ export class AudioClassificationPipeline extends AudioClassificationPipeline_base { - _call(audio: AudioPipelineInputs, options?: AudioClassificationPipelineOptions): Promise; + _call( + audio: AudioPipelineInputs, + options?: AudioClassificationPipelineOptions, + ): Promise; } -declare const ZeroShotAudioClassificationPipeline_base: new (options: TextAudioPipelineConstructorArgs) => ZeroShotAudioClassificationPipelineType; +declare const ZeroShotAudioClassificationPipeline_base: new ( + options: TextAudioPipelineConstructorArgs, +) => ZeroShotAudioClassificationPipelineType; /** * @typedef {Object} ZeroShotAudioClassificationOutput * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`. @@ -708,9 +777,17 @@ declare const ZeroShotAudioClassificationPipeline_base: new (options: TextAudioP * ``` */ export class ZeroShotAudioClassificationPipeline extends ZeroShotAudioClassificationPipeline_base { - _call(audio: AudioPipelineInputs, candidate_labels: string[], options?: ZeroShotAudioClassificationPipelineOptions): Promise; + _call( + audio: AudioPipelineInputs, + candidate_labels: string[], + options?: ZeroShotAudioClassificationPipelineOptions, + ): Promise< + ZeroShotAudioClassificationOutput[] | ZeroShotAudioClassificationOutput[][] + >; } -declare const AutomaticSpeechRecognitionPipeline_base: new (options: TextAudioPipelineConstructorArgs) => AutomaticSpeechRecognitionPipelineType; +declare const AutomaticSpeechRecognitionPipeline_base: new ( + options: TextAudioPipelineConstructorArgs, +) => AutomaticSpeechRecognitionPipelineType; /** * @typedef {{stride: number[], input_features: Tensor, is_last: boolean, tokens?: number[], token_timestamps?: number[]}} ChunkCallbackItem * @callback ChunkCallback @@ -820,19 +897,26 @@ declare const AutomaticSpeechRecognitionPipeline_base: new (options: TextAudioPi * 
``` */ export class AutomaticSpeechRecognitionPipeline extends AutomaticSpeechRecognitionPipeline_base { - _call(audio: AudioPipelineInputs, options?: AutomaticSpeechRecognitionConfig): Promise; - /** - * @type {AutomaticSpeechRecognitionPipelineCallback} - * @private - */ - private _call_wav2vec2; - /** - * @type {AutomaticSpeechRecognitionPipelineCallback} - * @private - */ - private _call_whisper; + _call( + audio: AudioPipelineInputs, + options?: AutomaticSpeechRecognitionConfig, + ): Promise< + AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[] + >; + /** + * @type {AutomaticSpeechRecognitionPipelineCallback} + * @private + */ + private _call_wav2vec2; + /** + * @type {AutomaticSpeechRecognitionPipelineCallback} + * @private + */ + private _call_whisper; } -declare const ImageToTextPipeline_base: new (options: TextImagePipelineConstructorArgs) => ImageToTextPipelineType; +declare const ImageToTextPipeline_base: new ( + options: TextImagePipelineConstructorArgs, +) => ImageToTextPipelineType; /** * @typedef {Object} ImageToTextSingle * @property {string} generated_text The generated text. @@ -865,9 +949,14 @@ declare const ImageToTextPipeline_base: new (options: TextImagePipelineConstruct * ``` */ export class ImageToTextPipeline extends ImageToTextPipeline_base { - _call(texts: ImagePipelineInputs, options?: import('./utils/generation.js').GenerationConfigType): Promise; + _call( + texts: ImagePipelineInputs, + options?: import("./utils/generation.js").GenerationConfigType, + ): Promise; } -declare const ImageClassificationPipeline_base: new (options: ImagePipelineConstructorArgs) => ImageClassificationPipelineType; +declare const ImageClassificationPipeline_base: new ( + options: ImagePipelineConstructorArgs, +) => ImageClassificationPipelineType; /** * @typedef {Object} ImageClassificationSingle * @property {string} label The label identified by the model. 
@@ -925,9 +1014,14 @@ declare const ImageClassificationPipeline_base: new (options: ImagePipelineConst * ``` */ export class ImageClassificationPipeline extends ImageClassificationPipeline_base { - _call(images: ImagePipelineInputs, options?: ImageClassificationPipelineOptions): Promise; + _call( + images: ImagePipelineInputs, + options?: ImageClassificationPipelineOptions, + ): Promise; } -declare const ImageSegmentationPipeline_base: new (options: ImagePipelineConstructorArgs) => ImageSegmentationPipelineType; +declare const ImageSegmentationPipeline_base: new ( + options: ImagePipelineConstructorArgs, +) => ImageSegmentationPipelineType; /** * @typedef {Object} ImageSegmentationPipelineOutput * @property {string} label The label of the segment. @@ -966,14 +1060,19 @@ declare const ImageSegmentationPipeline_base: new (options: ImagePipelineConstru * ``` */ export class ImageSegmentationPipeline extends ImageSegmentationPipeline_base { - subtasks_mapping: { - panoptic: string; - instance: string; - semantic: string; - }; - _call(images: ImagePipelineInputs, options?: ImageSegmentationPipelineOptions): Promise; + subtasks_mapping: { + panoptic: string; + instance: string; + semantic: string; + }; + _call( + images: ImagePipelineInputs, + options?: ImageSegmentationPipelineOptions, + ): Promise; } -declare const ZeroShotImageClassificationPipeline_base: new (options: TextImagePipelineConstructorArgs) => ZeroShotImageClassificationPipelineType; +declare const ZeroShotImageClassificationPipeline_base: new ( + options: TextImagePipelineConstructorArgs, +) => ZeroShotImageClassificationPipelineType; /** * @typedef {Object} ZeroShotImageClassificationOutput * @property {string} label The label identified by the model. It is one of the suggested `candidate_label`. 
@@ -1009,9 +1108,17 @@ declare const ZeroShotImageClassificationPipeline_base: new (options: TextImageP * ``` */ export class ZeroShotImageClassificationPipeline extends ZeroShotImageClassificationPipeline_base { - _call(images: ImagePipelineInputs, candidate_labels: string[], options?: ZeroShotImageClassificationPipelineOptions): Promise; + _call( + images: ImagePipelineInputs, + candidate_labels: string[], + options?: ZeroShotImageClassificationPipelineOptions, + ): Promise< + ZeroShotImageClassificationOutput[] | ZeroShotImageClassificationOutput[][] + >; } -declare const ObjectDetectionPipeline_base: new (options: ImagePipelineConstructorArgs) => ObjectDetectionPipelineType; +declare const ObjectDetectionPipeline_base: new ( + options: ImagePipelineConstructorArgs, +) => ObjectDetectionPipelineType; /** * @typedef {Object} ObjectDetectionPipelineSingle * @property {string} label The class label identified by the model. @@ -1053,9 +1160,14 @@ declare const ObjectDetectionPipeline_base: new (options: ImagePipelineConstruct * ``` */ export class ObjectDetectionPipeline extends ObjectDetectionPipeline_base { - _call(images: ImagePipelineInputs, options?: ObjectDetectionPipelineOptions): Promise; + _call( + images: ImagePipelineInputs, + options?: ObjectDetectionPipelineOptions, + ): Promise; } -declare const ZeroShotObjectDetectionPipeline_base: new (options: TextImagePipelineConstructorArgs) => ZeroShotObjectDetectionPipelineType; +declare const ZeroShotObjectDetectionPipeline_base: new ( + options: TextImagePipelineConstructorArgs, +) => ZeroShotObjectDetectionPipelineType; /** * @typedef {Object} ZeroShotObjectDetectionOutput * @property {string} label Text query corresponding to the found object. 
@@ -1142,9 +1254,17 @@ declare const ZeroShotObjectDetectionPipeline_base: new (options: TextImagePipel * ``` */ export class ZeroShotObjectDetectionPipeline extends ZeroShotObjectDetectionPipeline_base { - _call(images: ImagePipelineInputs, candidate_labels: string[], options?: ZeroShotObjectDetectionPipelineOptions): Promise; + _call( + images: ImagePipelineInputs, + candidate_labels: string[], + options?: ZeroShotObjectDetectionPipelineOptions, + ): Promise< + ZeroShotObjectDetectionOutput[] | ZeroShotObjectDetectionOutput[][] + >; } -declare const DocumentQuestionAnsweringPipeline_base: new (options: TextImagePipelineConstructorArgs) => DocumentQuestionAnsweringPipelineType; +declare const DocumentQuestionAnsweringPipeline_base: new ( + options: TextImagePipelineConstructorArgs, +) => DocumentQuestionAnsweringPipelineType; /** * @typedef {Object} DocumentQuestionAnsweringSingle * @property {string} answer The generated text. @@ -1173,9 +1293,17 @@ declare const DocumentQuestionAnsweringPipeline_base: new (options: TextImagePip * ``` */ export class DocumentQuestionAnsweringPipeline extends DocumentQuestionAnsweringPipeline_base { - _call(image: ImageInput, question: string, options?: import('./utils/generation.js').GenerationConfigType): Promise; + _call( + image: ImageInput, + question: string, + options?: import("./utils/generation.js").GenerationConfigType, + ): Promise< + DocumentQuestionAnsweringOutput | DocumentQuestionAnsweringOutput[] + >; } -declare const TextToAudioPipeline_base: new (options: TextToAudioPipelineConstructorArgs) => TextToAudioPipelineType; +declare const TextToAudioPipeline_base: new ( + options: TextToAudioPipelineConstructorArgs, +) => TextToAudioPipelineType; /** * @typedef {Object} VocoderOptions * @property {PreTrainedModel} [vocoder] The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder. 
@@ -1232,21 +1360,31 @@ declare const TextToAudioPipeline_base: new (options: TextToAudioPipelineConstru * ``` */ export class TextToAudioPipeline extends TextToAudioPipeline_base { - DEFAULT_VOCODER_ID: string; - vocoder: PreTrainedModel; - _call(texts: string | string[], options: TextToAudioPipelineOptions): Promise; - _call_text_to_waveform(text_inputs: any): Promise<{ - audio: any; - sampling_rate: any; - }>; - _call_text_to_spectrogram(text_inputs: any, { speaker_embeddings }: { - speaker_embeddings: any; - }): Promise<{ - audio: any; - sampling_rate: any; - }>; + DEFAULT_VOCODER_ID: string; + vocoder: PreTrainedModel; + _call( + texts: string | string[], + options: TextToAudioPipelineOptions, + ): Promise; + _call_text_to_waveform(text_inputs: any): Promise<{ + audio: any; + sampling_rate: any; + }>; + _call_text_to_spectrogram( + text_inputs: any, + { + speaker_embeddings, + }: { + speaker_embeddings: any; + }, + ): Promise<{ + audio: any; + sampling_rate: any; + }>; } -declare const ImageToImagePipeline_base: new (options: ImagePipelineConstructorArgs) => ImageToImagePipelineType; +declare const ImageToImagePipeline_base: new ( + options: ImagePipelineConstructorArgs, +) => ImageToImagePipelineType; /** * @callback ImageToImagePipelineCallback Transform the image(s) passed as inputs. * @param {ImagePipelineInputs} images The images to transform. 
@@ -1271,9 +1409,11 @@ declare const ImageToImagePipeline_base: new (options: ImagePipelineConstructorA * ``` */ export class ImageToImagePipeline extends ImageToImagePipeline_base { - _call(images: ImagePipelineInputs): Promise; + _call(images: ImagePipelineInputs): Promise; } -declare const DepthEstimationPipeline_base: new (options: ImagePipelineConstructorArgs) => DepthEstimationPipelineType; +declare const DepthEstimationPipeline_base: new ( + options: ImagePipelineConstructorArgs, +) => DepthEstimationPipelineType; /** * @typedef {Object} DepthEstimationPipelineOutput * @property {Tensor} predicted_depth The raw depth map predicted by the model. @@ -1310,29 +1450,31 @@ declare const DepthEstimationPipeline_base: new (options: ImagePipelineConstruct * ``` */ export class DepthEstimationPipeline extends DepthEstimationPipeline_base { - _call(images: ImagePipelineInputs): Promise; + _call( + images: ImagePipelineInputs, + ): Promise; } export type ImageInput = string | RawImage | URL; export type ImagePipelineInputs = ImageInput | ImageInput[]; export type AudioInput = string | URL | Float32Array | Float64Array; export type AudioPipelineInputs = AudioInput | AudioInput[]; export type BoundingBox = { - /** - * The minimum x coordinate of the bounding box. - */ - xmin: number; - /** - * The minimum y coordinate of the bounding box. - */ - ymin: number; - /** - * The maximum x coordinate of the bounding box. - */ - xmax: number; - /** - * The maximum y coordinate of the bounding box. - */ - ymax: number; + /** + * The minimum x coordinate of the bounding box. + */ + xmin: number; + /** + * The minimum y coordinate of the bounding box. + */ + ymin: number; + /** + * The maximum x coordinate of the bounding box. + */ + xmax: number; + /** + * The maximum y coordinate of the bounding box. 
+ */ + ymax: number; }; export type TaskType = keyof typeof SUPPORTED_TASKS; export type AliasType = keyof typeof TASK_ALIASES; @@ -1344,39 +1486,39 @@ export type PipelineType = TaskType | AliasType; * A mapping of pipeline names to their corresponding pipeline classes. */ export type SupportedTasks = { - "text-classification": TextClassificationPipeline; - "token-classification": TokenClassificationPipeline; - "question-answering": QuestionAnsweringPipeline; - "fill-mask": FillMaskPipeline; - summarization: SummarizationPipeline; - translation: TranslationPipeline; - "text2text-generation": Text2TextGenerationPipeline; - "text-generation": TextGenerationPipeline; - "zero-shot-classification": ZeroShotClassificationPipeline; - "audio-classification": AudioClassificationPipeline; - "zero-shot-audio-classification": ZeroShotAudioClassificationPipeline; - "automatic-speech-recognition": AutomaticSpeechRecognitionPipeline; - "text-to-audio": TextToAudioPipeline; - "image-to-text": ImageToTextPipeline; - "image-classification": ImageClassificationPipeline; - "image-segmentation": ImageSegmentationPipeline; - "zero-shot-image-classification": ZeroShotImageClassificationPipeline; - "object-detection": ObjectDetectionPipeline; - "zero-shot-object-detection": ZeroShotObjectDetectionPipeline; - "document-question-answering": DocumentQuestionAnsweringPipeline; - "image-to-image": ImageToImagePipeline; - "depth-estimation": DepthEstimationPipeline; - "feature-extraction": FeatureExtractionPipeline; + "text-classification": TextClassificationPipeline; + "token-classification": TokenClassificationPipeline; + "question-answering": QuestionAnsweringPipeline; + "fill-mask": FillMaskPipeline; + summarization: SummarizationPipeline; + translation: TranslationPipeline; + "text2text-generation": Text2TextGenerationPipeline; + "text-generation": TextGenerationPipeline; + "zero-shot-classification": ZeroShotClassificationPipeline; + "audio-classification": AudioClassificationPipeline; + 
"zero-shot-audio-classification": ZeroShotAudioClassificationPipeline; + "automatic-speech-recognition": AutomaticSpeechRecognitionPipeline; + "text-to-audio": TextToAudioPipeline; + "image-to-text": ImageToTextPipeline; + "image-classification": ImageClassificationPipeline; + "image-segmentation": ImageSegmentationPipeline; + "zero-shot-image-classification": ZeroShotImageClassificationPipeline; + "object-detection": ObjectDetectionPipeline; + "zero-shot-object-detection": ZeroShotObjectDetectionPipeline; + "document-question-answering": DocumentQuestionAnsweringPipeline; + "image-to-image": ImageToImagePipeline; + "depth-estimation": DepthEstimationPipeline; + "feature-extraction": FeatureExtractionPipeline; }; /** * A mapping from pipeline aliases to their corresponding pipeline classes. */ export type AliasTasks = { - "sentiment-analysis": TextClassificationPipeline; - ner: TokenClassificationPipeline; - asr: AutomaticSpeechRecognitionPipeline; - "text-to-speech": TextToAudioPipeline; - embeddings: FeatureExtractionPipeline; + "sentiment-analysis": TextClassificationPipeline; + ner: TokenClassificationPipeline; + asr: AutomaticSpeechRecognitionPipeline; + "text-to-speech": TextToAudioPipeline; + embeddings: FeatureExtractionPipeline; }; /** * A mapping from all pipeline names and aliases to their corresponding pipeline classes. @@ -1387,42 +1529,42 @@ export type AllTasks = SupportedTasks & AliasTasks; */ export type DisposeType = () => Promise; export type Disposable = { - /** - * A promise that resolves when the pipeline has been disposed. - */ - dispose: DisposeType; + /** + * A promise that resolves when the pipeline has been disposed. + */ + dispose: DisposeType; }; export type ModelTokenizerConstructorArgs = { - /** - * The task of the pipeline. Useful for specifying subtasks. - */ - task: string; - /** - * The model used by the pipeline. - */ - model: PreTrainedModel; - /** - * The tokenizer used by the pipeline. 
- */ - tokenizer: PreTrainedTokenizer; + /** + * The task of the pipeline. Useful for specifying subtasks. + */ + task: string; + /** + * The model used by the pipeline. + */ + model: PreTrainedModel; + /** + * The tokenizer used by the pipeline. + */ + tokenizer: PreTrainedTokenizer; }; /** * An object used to instantiate a text-based pipeline. */ export type TextPipelineConstructorArgs = ModelTokenizerConstructorArgs; export type ModelProcessorConstructorArgs = { - /** - * The task of the pipeline. Useful for specifying subtasks. - */ - task: string; - /** - * The model used by the pipeline. - */ - model: PreTrainedModel; - /** - * The processor used by the pipeline. - */ - processor: Processor; + /** + * The task of the pipeline. Useful for specifying subtasks. + */ + task: string; + /** + * The model used by the pipeline. + */ + model: PreTrainedModel; + /** + * The processor used by the pipeline. + */ + processor: Processor; }; /** * An object used to instantiate an audio-based pipeline. @@ -1433,898 +1575,1042 @@ export type AudioPipelineConstructorArgs = ModelProcessorConstructorArgs; */ export type ImagePipelineConstructorArgs = ModelProcessorConstructorArgs; export type ModelTokenizerProcessorConstructorArgs = { - /** - * The task of the pipeline. Useful for specifying subtasks. - */ - task: string; - /** - * The model used by the pipeline. - */ - model: PreTrainedModel; - /** - * The tokenizer used by the pipeline. - */ - tokenizer: PreTrainedTokenizer; - /** - * The processor used by the pipeline. - */ - processor: Processor; + /** + * The task of the pipeline. Useful for specifying subtasks. + */ + task: string; + /** + * The model used by the pipeline. + */ + model: PreTrainedModel; + /** + * The tokenizer used by the pipeline. + */ + tokenizer: PreTrainedTokenizer; + /** + * The processor used by the pipeline. + */ + processor: Processor; }; /** * An object used to instantiate a text- and audio-based pipeline. 
*/ -export type TextAudioPipelineConstructorArgs = ModelTokenizerProcessorConstructorArgs; +export type TextAudioPipelineConstructorArgs = + ModelTokenizerProcessorConstructorArgs; /** * An object used to instantiate a text- and image-based pipeline. */ -export type TextImagePipelineConstructorArgs = ModelTokenizerProcessorConstructorArgs; +export type TextImagePipelineConstructorArgs = + ModelTokenizerProcessorConstructorArgs; export type TextClassificationSingle = { - /** - * The label predicted. - */ - label: string; - /** - * The corresponding probability. - */ - score: number; + /** + * The label predicted. + */ + label: string; + /** + * The corresponding probability. + */ + score: number; }; export type TextClassificationOutput = TextClassificationSingle[]; /** * Parameters specific to text classification pipelines. */ export type TextClassificationPipelineOptions = { - /** - * The number of top predictions to be returned. - */ - topk?: number; + /** + * The number of top predictions to be returned. + */ + topk?: number; }; /** * Classify the text(s) given as inputs. */ -export type TextClassificationPipelineCallback = (texts: string | string[], options?: TextClassificationPipelineOptions) => Promise; -export type TextClassificationPipelineType = TextPipelineConstructorArgs & TextClassificationPipelineCallback & Disposable; +export type TextClassificationPipelineCallback = ( + texts: string | string[], + options?: TextClassificationPipelineOptions, +) => Promise; +export type TextClassificationPipelineType = TextPipelineConstructorArgs & + TextClassificationPipelineCallback & + Disposable; export type TokenClassificationSingle = { - /** - * The token/word classified. This is obtained by decoding the selected tokens. - */ - word: string; - /** - * The corresponding probability for `entity`. - */ - score: number; - /** - * The entity predicted for that token/word. - */ - entity: string; - /** - * The index of the corresponding token in the sentence. 
- */ - index: number; - /** - * The index of the start of the corresponding entity in the sentence. - */ - start?: number; - /** - * The index of the end of the corresponding entity in the sentence. - */ - end?: number; + /** + * The token/word classified. This is obtained by decoding the selected tokens. + */ + word: string; + /** + * The corresponding probability for `entity`. + */ + score: number; + /** + * The entity predicted for that token/word. + */ + entity: string; + /** + * The index of the corresponding token in the sentence. + */ + index: number; + /** + * The index of the start of the corresponding entity in the sentence. + */ + start?: number; + /** + * The index of the end of the corresponding entity in the sentence. + */ + end?: number; }; export type TokenClassificationOutput = TokenClassificationSingle[]; /** * Parameters specific to token classification pipelines. */ export type TokenClassificationPipelineOptions = { - /** - * A list of labels to ignore. - */ - ignore_labels?: string[]; + /** + * A list of labels to ignore. + */ + ignore_labels?: string[]; }; /** * Classify each token of the text(s) given as inputs. */ -export type TokenClassificationPipelineCallback = (texts: string | string[], options?: TokenClassificationPipelineOptions) => Promise; -export type TokenClassificationPipelineType = TextPipelineConstructorArgs & TokenClassificationPipelineCallback & Disposable; +export type TokenClassificationPipelineCallback = ( + texts: string | string[], + options?: TokenClassificationPipelineOptions, +) => Promise; +export type TokenClassificationPipelineType = TextPipelineConstructorArgs & + TokenClassificationPipelineCallback & + Disposable; export type QuestionAnsweringOutput = { - /** - * The probability associated to the answer. - */ - score: number; - /** - * The character start index of the answer (in the tokenized version of the input). 
- */ - start?: number; - /** - * The character end index of the answer (in the tokenized version of the input). - */ - end?: number; - /** - * The answer to the question. - */ - answer: string; + /** + * The probability associated to the answer. + */ + score: number; + /** + * The character start index of the answer (in the tokenized version of the input). + */ + start?: number; + /** + * The character end index of the answer (in the tokenized version of the input). + */ + end?: number; + /** + * The answer to the question. + */ + answer: string; }; /** * Parameters specific to question answering pipelines. */ export type QuestionAnsweringPipelineOptions = { - /** - * The number of top answer predictions to be returned. - */ - topk?: number; + /** + * The number of top answer predictions to be returned. + */ + topk?: number; }; /** * Answer the question(s) given as inputs by using the context(s). */ -export type QuestionAnsweringPipelineCallback = (question: string | string[], context: string | string[], options?: QuestionAnsweringPipelineOptions) => Promise; -export type QuestionAnsweringPipelineType = TextPipelineConstructorArgs & QuestionAnsweringPipelineCallback & Disposable; +export type QuestionAnsweringPipelineCallback = ( + question: string | string[], + context: string | string[], + options?: QuestionAnsweringPipelineOptions, +) => Promise; +export type QuestionAnsweringPipelineType = TextPipelineConstructorArgs & + QuestionAnsweringPipelineCallback & + Disposable; export type FillMaskSingle = { - /** - * The corresponding input with the mask token prediction. - */ - sequence: string; - /** - * The corresponding probability. - */ - score: number; - /** - * The predicted token id (to replace the masked one). - */ - token: number; - /** - * The predicted token (to replace the masked one). - */ - token_str: string; + /** + * The corresponding input with the mask token prediction. + */ + sequence: string; + /** + * The corresponding probability. 
+ */ + score: number; + /** + * The predicted token id (to replace the masked one). + */ + token: number; + /** + * The predicted token (to replace the masked one). + */ + token_str: string; }; export type FillMaskOutput = FillMaskSingle[]; /** * Parameters specific to fill mask pipelines. */ export type FillMaskPipelineOptions = { - /** - * When passed, overrides the number of predictions to return. - */ - topk?: number; + /** + * When passed, overrides the number of predictions to return. + */ + topk?: number; }; /** * Fill the masked token in the text(s) given as inputs. */ -export type FillMaskPipelineCallback = (texts: string | string[], options?: FillMaskPipelineOptions) => Promise; -export type FillMaskPipelineType = TextPipelineConstructorArgs & FillMaskPipelineCallback & Disposable; +export type FillMaskPipelineCallback = ( + texts: string | string[], + options?: FillMaskPipelineOptions, +) => Promise; +export type FillMaskPipelineType = TextPipelineConstructorArgs & + FillMaskPipelineCallback & + Disposable; export type Text2TextGenerationSingle = { - /** - * The generated text. - */ - generated_text: string; + /** + * The generated text. + */ + generated_text: string; }; export type Text2TextGenerationOutput = Text2TextGenerationSingle[]; /** * Generate the output text(s) using text(s) given as inputs. */ -export type Text2TextGenerationPipelineCallback = (texts: string | string[], options?: import('./utils/generation.js').GenerationConfigType) => Promise; -export type Text2TextGenerationPipelineType = TextPipelineConstructorArgs & Text2TextGenerationPipelineCallback & Disposable; +export type Text2TextGenerationPipelineCallback = ( + texts: string | string[], + options?: import("./utils/generation.js").GenerationConfigType, +) => Promise; +export type Text2TextGenerationPipelineType = TextPipelineConstructorArgs & + Text2TextGenerationPipelineCallback & + Disposable; export type SummarizationSingle = { - /** - * The summary text. 
- */ - summary_text: string; + /** + * The summary text. + */ + summary_text: string; }; export type SummarizationOutput = SummarizationSingle[]; /** * Summarize the text(s) given as inputs. */ -export type SummarizationPipelineCallback = (texts: string | string[], options?: import('./utils/generation.js').GenerationConfigType) => Promise; -export type SummarizationPipelineType = TextPipelineConstructorArgs & SummarizationPipelineCallback & Disposable; +export type SummarizationPipelineCallback = ( + texts: string | string[], + options?: import("./utils/generation.js").GenerationConfigType, +) => Promise; +export type SummarizationPipelineType = TextPipelineConstructorArgs & + SummarizationPipelineCallback & + Disposable; export type TranslationSingle = { - /** - * The translated text. - */ - translation_text: string; + /** + * The translated text. + */ + translation_text: string; }; export type TranslationOutput = TranslationSingle[]; /** * Translate the text(s) given as inputs. */ -export type TranslationPipelineCallback = (texts: string | string[], options?: import('./utils/generation.js').GenerationConfigType) => Promise; -export type TranslationPipelineType = TextPipelineConstructorArgs & TranslationPipelineCallback & Disposable; +export type TranslationPipelineCallback = ( + texts: string | string[], + options?: import("./utils/generation.js").GenerationConfigType, +) => Promise; +export type TranslationPipelineType = TextPipelineConstructorArgs & + TranslationPipelineCallback & + Disposable; export type TextGenerationSingle = { - /** - * The generated text. - */ - generated_text: string; + /** + * The generated text. + */ + generated_text: string; }; export type TextGenerationOutput = TextGenerationSingle[]; /** * Parameters specific to text-generation pipelines. */ export type TextGenerationSpecificParams = { - /** - * Whether or not to add special tokens when tokenizing the sequences. 
- */ - add_special_tokens?: boolean; + /** + * Whether or not to add special tokens when tokenizing the sequences. + */ + add_special_tokens?: boolean; }; -export type TextGenerationConfig = import('./utils/generation.js').GenerationConfigType & TextGenerationSpecificParams; +export type TextGenerationConfig = + import("./utils/generation.js").GenerationConfigType & + TextGenerationSpecificParams; /** * Complete the prompt(s) given as inputs. */ -export type TextGenerationPipelineCallback = (texts: string | string[], options?: TextGenerationConfig) => Promise; -export type TextGenerationPipelineType = TextPipelineConstructorArgs & TextGenerationPipelineCallback & Disposable; +export type TextGenerationPipelineCallback = ( + texts: string | string[], + options?: TextGenerationConfig, +) => Promise; +export type TextGenerationPipelineType = TextPipelineConstructorArgs & + TextGenerationPipelineCallback & + Disposable; export type ZeroShotClassificationOutput = { - /** - * The sequence for which this is the output. - */ - sequence: string; - /** - * The labels sorted by order of likelihood. - */ - labels: string[]; - /** - * The probabilities for each of the labels. - */ - scores: number[]; + /** + * The sequence for which this is the output. + */ + sequence: string; + /** + * The labels sorted by order of likelihood. + */ + labels: string[]; + /** + * The probabilities for each of the labels. + */ + scores: number[]; }; /** * Parameters specific to zero-shot classification pipelines. */ export type ZeroShotClassificationPipelineOptions = { - /** - * The template used to turn each - * candidate label into an NLI-style hypothesis. The candidate label will replace the {} placeholder. - */ - hypothesis_template?: string; - /** - * Whether or not multiple candidate labels can be true. - * If `false`, the scores are normalized such that the sum of the label likelihoods for each sequence - * is 1. 
If `true`, the labels are considered independent and probabilities are normalized for each - * candidate by doing a softmax of the entailment score vs. the contradiction score. - */ - multi_label?: boolean; + /** + * The template used to turn each + * candidate label into an NLI-style hypothesis. The candidate label will replace the {} placeholder. + */ + hypothesis_template?: string; + /** + * Whether or not multiple candidate labels can be true. + * If `false`, the scores are normalized such that the sum of the label likelihoods for each sequence + * is 1. If `true`, the labels are considered independent and probabilities are normalized for each + * candidate by doing a softmax of the entailment score vs. the contradiction score. + */ + multi_label?: boolean; }; /** * Classify the sequence(s) given as inputs. */ -export type ZeroShotClassificationPipelineCallback = (texts: string | string[], candidate_labels: string | string[], options?: ZeroShotClassificationPipelineOptions) => Promise; -export type ZeroShotClassificationPipelineType = TextPipelineConstructorArgs & ZeroShotClassificationPipelineCallback & Disposable; +export type ZeroShotClassificationPipelineCallback = ( + texts: string | string[], + candidate_labels: string | string[], + options?: ZeroShotClassificationPipelineOptions, +) => Promise; +export type ZeroShotClassificationPipelineType = TextPipelineConstructorArgs & + ZeroShotClassificationPipelineCallback & + Disposable; /** * Parameters specific to feature extraction pipelines. */ export type FeatureExtractionPipelineOptions = { - /** - * The pooling method to use. - */ - pooling?: 'none' | 'mean' | 'cls'; - /** - * Whether or not to normalize the embeddings in the last dimension. - */ - normalize?: boolean; + /** + * The pooling method to use. + */ + pooling?: "none" | "mean" | "cls"; + /** + * Whether or not to normalize the embeddings in the last dimension. + */ + normalize?: boolean; }; /** * Extract the features of the input(s). 
*/ -export type FeatureExtractionPipelineCallback = (texts: string | string[], options?: FeatureExtractionPipelineOptions) => Promise; -export type FeatureExtractionPipelineType = TextPipelineConstructorArgs & FeatureExtractionPipelineCallback & Disposable; +export type FeatureExtractionPipelineCallback = ( + texts: string | string[], + options?: FeatureExtractionPipelineOptions, +) => Promise; +export type FeatureExtractionPipelineType = TextPipelineConstructorArgs & + FeatureExtractionPipelineCallback & + Disposable; export type AudioClassificationSingle = { - /** - * The label predicted. - */ - label: string; - /** - * The corresponding probability. - */ - score: number; + /** + * The label predicted. + */ + label: string; + /** + * The corresponding probability. + */ + score: number; }; export type AudioClassificationOutput = AudioClassificationSingle[]; /** * Parameters specific to audio classification pipelines. */ export type AudioClassificationPipelineOptions = { - /** - * The number of top labels that will be returned by the pipeline. - * If the provided number is `null` or higher than the number of labels available in the model configuration, - * it will default to the number of labels. - */ - topk?: number; + /** + * The number of top labels that will be returned by the pipeline. + * If the provided number is `null` or higher than the number of labels available in the model configuration, + * it will default to the number of labels. + */ + topk?: number; }; /** * Classify the sequence(s) given as inputs. 
*/ -export type AudioClassificationPipelineCallback = (audio: AudioPipelineInputs, options?: AudioClassificationPipelineOptions) => Promise; -export type AudioClassificationPipelineType = AudioPipelineConstructorArgs & AudioClassificationPipelineCallback & Disposable; +export type AudioClassificationPipelineCallback = ( + audio: AudioPipelineInputs, + options?: AudioClassificationPipelineOptions, +) => Promise; +export type AudioClassificationPipelineType = AudioPipelineConstructorArgs & + AudioClassificationPipelineCallback & + Disposable; export type ZeroShotAudioClassificationOutput = { - /** - * The label identified by the model. It is one of the suggested `candidate_label`. - */ - label: string; - /** - * The score attributed by the model for that label (between 0 and 1). - */ - score: number; + /** + * The label identified by the model. It is one of the suggested `candidate_label`. + */ + label: string; + /** + * The score attributed by the model for that label (between 0 and 1). + */ + score: number; }; /** * Parameters specific to zero-shot audio classification pipelines. */ export type ZeroShotAudioClassificationPipelineOptions = { - /** - * The sentence used in conjunction with `candidate_labels` - * to attempt the audio classification by replacing the placeholder with the candidate_labels. - * Then likelihood is estimated by using `logits_per_audio`. - */ - hypothesis_template?: string; + /** + * The sentence used in conjunction with `candidate_labels` + * to attempt the audio classification by replacing the placeholder with the candidate_labels. + * Then likelihood is estimated by using `logits_per_audio`. + */ + hypothesis_template?: string; }; /** * Classify the sequence(s) given as inputs. 
*/ -export type ZeroShotAudioClassificationPipelineCallback = (audio: AudioPipelineInputs, candidate_labels: string[], options?: ZeroShotAudioClassificationPipelineOptions) => Promise; -export type ZeroShotAudioClassificationPipelineType = TextAudioPipelineConstructorArgs & ZeroShotAudioClassificationPipelineCallback & Disposable; +export type ZeroShotAudioClassificationPipelineCallback = ( + audio: AudioPipelineInputs, + candidate_labels: string[], + options?: ZeroShotAudioClassificationPipelineOptions, +) => Promise< + ZeroShotAudioClassificationOutput[] | ZeroShotAudioClassificationOutput[][] +>; +export type ZeroShotAudioClassificationPipelineType = + TextAudioPipelineConstructorArgs & + ZeroShotAudioClassificationPipelineCallback & + Disposable; export type ChunkCallbackItem = { - stride: number[]; - input_features: Tensor; - is_last: boolean; - tokens?: number[]; - token_timestamps?: number[]; + stride: number[]; + input_features: Tensor; + is_last: boolean; + tokens?: number[]; + token_timestamps?: number[]; }; export type ChunkCallback = (chunk: ChunkCallbackItem) => any; export type Chunk = { - /** - * The start and end timestamp of the chunk in seconds. - */ - timestamp: [number, number]; - /** - * The recognized text. - */ - text: string; + /** + * The start and end timestamp of the chunk in seconds. + */ + timestamp: [number, number]; + /** + * The recognized text. + */ + text: string; }; export type AutomaticSpeechRecognitionOutput = { - /** - * The recognized text. - */ - text: string; - /** - * When using `return_timestamps`, the `chunks` will become a list - * containing all the various text chunks identified by the model. - */ - chunks?: Chunk[]; + /** + * The recognized text. + */ + text: string; + /** + * When using `return_timestamps`, the `chunks` will become a list + * containing all the various text chunks identified by the model. + */ + chunks?: Chunk[]; }; /** * Parameters specific to automatic-speech-recognition pipelines. 
*/ export type AutomaticSpeechRecognitionSpecificParams = { - /** - * Whether to return timestamps or not. Default is `false`. - */ - return_timestamps?: boolean | 'word'; - /** - * The length of audio chunks to process in seconds. Default is 0 (no chunking). - */ - chunk_length_s?: number; - /** - * The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. - */ - stride_length_s?: number; - /** - * Callback function to be called with each chunk processed. - */ - chunk_callback?: ChunkCallback; - /** - * Whether to force outputting full sequences or not. Default is `false`. - */ - force_full_sequences?: boolean; - /** - * The source language. Default is `null`, meaning it should be auto-detected. Use this to potentially improve performance if the source language is known. - */ - language?: string; - /** - * The task to perform. Default is `null`, meaning it should be auto-detected. - */ - task?: string; - /** - * A list of pairs of integers which indicates a mapping from generation indices to token indices - * that will be forced before sampling. For example, [[1, 123]] means the second generated token will always be a token of index 123. - */ - forced_decoder_ids?: number[][]; - /** - * The number of frames in the input audio. - */ - num_frames?: number; + /** + * Whether to return timestamps or not. Default is `false`. + */ + return_timestamps?: boolean | "word"; + /** + * The length of audio chunks to process in seconds. Default is 0 (no chunking). + */ + chunk_length_s?: number; + /** + * The length of overlap between consecutive audio chunks in seconds. If not provided, defaults to `chunk_length_s / 6`. + */ + stride_length_s?: number; + /** + * Callback function to be called with each chunk processed. + */ + chunk_callback?: ChunkCallback; + /** + * Whether to force outputting full sequences or not. Default is `false`. + */ + force_full_sequences?: boolean; + /** + * The source language. 
Default is `null`, meaning it should be auto-detected. Use this to potentially improve performance if the source language is known. + */ + language?: string; + /** + * The task to perform. Default is `null`, meaning it should be auto-detected. + */ + task?: string; + /** + * A list of pairs of integers which indicates a mapping from generation indices to token indices + * that will be forced before sampling. For example, [[1, 123]] means the second generated token will always be a token of index 123. + */ + forced_decoder_ids?: number[][]; + /** + * The number of frames in the input audio. + */ + num_frames?: number; }; -export type AutomaticSpeechRecognitionConfig = import('./utils/generation.js').GenerationConfigType & AutomaticSpeechRecognitionSpecificParams; +export type AutomaticSpeechRecognitionConfig = + import("./utils/generation.js").GenerationConfigType & + AutomaticSpeechRecognitionSpecificParams; /** * Transcribe the audio sequence(s) given as inputs to text. */ -export type AutomaticSpeechRecognitionPipelineCallback = (audio: AudioPipelineInputs, options?: AutomaticSpeechRecognitionConfig) => Promise; -export type AutomaticSpeechRecognitionPipelineType = TextAudioPipelineConstructorArgs & AutomaticSpeechRecognitionPipelineCallback & Disposable; +export type AutomaticSpeechRecognitionPipelineCallback = ( + audio: AudioPipelineInputs, + options?: AutomaticSpeechRecognitionConfig, +) => Promise< + AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[] +>; +export type AutomaticSpeechRecognitionPipelineType = + TextAudioPipelineConstructorArgs & + AutomaticSpeechRecognitionPipelineCallback & + Disposable; export type ImageToTextSingle = { - /** - * The generated text. - */ - generated_text: string; + /** + * The generated text. + */ + generated_text: string; }; export type ImageToTextOutput = ImageToTextSingle[]; /** * Assign labels to the image(s) passed as inputs. 
*/ -export type ImageToTextPipelineCallback = (texts: ImagePipelineInputs, options?: import('./utils/generation.js').GenerationConfigType) => Promise; -export type ImageToTextPipelineType = TextImagePipelineConstructorArgs & ImageToTextPipelineCallback & Disposable; +export type ImageToTextPipelineCallback = ( + texts: ImagePipelineInputs, + options?: import("./utils/generation.js").GenerationConfigType, +) => Promise; +export type ImageToTextPipelineType = TextImagePipelineConstructorArgs & + ImageToTextPipelineCallback & + Disposable; export type ImageClassificationSingle = { - /** - * The label identified by the model. - */ - label: string; - /** - * The score attributed by the model for that label. - */ - score: number; + /** + * The label identified by the model. + */ + label: string; + /** + * The score attributed by the model for that label. + */ + score: number; }; export type ImageClassificationOutput = ImageClassificationSingle[]; /** * Parameters specific to image classification pipelines. */ export type ImageClassificationPipelineOptions = { - /** - * The number of top labels that will be returned by the pipeline. - */ - topk?: number; + /** + * The number of top labels that will be returned by the pipeline. + */ + topk?: number; }; /** * Assign labels to the image(s) passed as inputs. */ -export type ImageClassificationPipelineCallback = (images: ImagePipelineInputs, options?: ImageClassificationPipelineOptions) => Promise; -export type ImageClassificationPipelineType = ImagePipelineConstructorArgs & ImageClassificationPipelineCallback & Disposable; +export type ImageClassificationPipelineCallback = ( + images: ImagePipelineInputs, + options?: ImageClassificationPipelineOptions, +) => Promise; +export type ImageClassificationPipelineType = ImagePipelineConstructorArgs & + ImageClassificationPipelineCallback & + Disposable; export type ImageSegmentationPipelineOutput = { - /** - * The label of the segment. 
- */ - label: string; - /** - * The score of the segment. - */ - score: number | null; - /** - * The mask of the segment. - */ - mask: RawImage; + /** + * The label of the segment. + */ + label: string; + /** + * The score of the segment. + */ + score: number | null; + /** + * The mask of the segment. + */ + mask: RawImage; }; /** * Parameters specific to image segmentation pipelines. */ export type ImageSegmentationPipelineOptions = { - /** - * Probability threshold to filter out predicted masks. - */ - threshold?: number; - /** - * Threshold to use when turning the predicted masks into binary values. - */ - mask_threshold?: number; - /** - * Mask overlap threshold to eliminate small, disconnected segments. - */ - overlap_mask_area_threshold?: number; - /** - * Segmentation task to be performed. One of [`panoptic`, `instance`, and `semantic`], - * depending on model capabilities. If not set, the pipeline will attempt to resolve (in that order). - */ - subtask?: null | string; - /** - * List of label ids to fuse. If not set, do not fuse any labels. - */ - label_ids_to_fuse?: number[]; - /** - * List of target sizes for the input images. If not set, use the original image sizes. - */ - target_sizes?: number[][]; + /** + * Probability threshold to filter out predicted masks. + */ + threshold?: number; + /** + * Threshold to use when turning the predicted masks into binary values. + */ + mask_threshold?: number; + /** + * Mask overlap threshold to eliminate small, disconnected segments. + */ + overlap_mask_area_threshold?: number; + /** + * Segmentation task to be performed. One of [`panoptic`, `instance`, and `semantic`], + * depending on model capabilities. If not set, the pipeline will attempt to resolve (in that order). + */ + subtask?: null | string; + /** + * List of label ids to fuse. If not set, do not fuse any labels. + */ + label_ids_to_fuse?: number[]; + /** + * List of target sizes for the input images. If not set, use the original image sizes. 
+ */ + target_sizes?: number[][]; }; /** * Segment the input images. */ -export type ImageSegmentationPipelineCallback = (images: ImagePipelineInputs, options?: ImageSegmentationPipelineOptions) => Promise; -export type ImageSegmentationPipelineType = ImagePipelineConstructorArgs & ImageSegmentationPipelineCallback & Disposable; +export type ImageSegmentationPipelineCallback = ( + images: ImagePipelineInputs, + options?: ImageSegmentationPipelineOptions, +) => Promise; +export type ImageSegmentationPipelineType = ImagePipelineConstructorArgs & + ImageSegmentationPipelineCallback & + Disposable; export type ZeroShotImageClassificationOutput = { - /** - * The label identified by the model. It is one of the suggested `candidate_label`. - */ - label: string; - /** - * The score attributed by the model for that label (between 0 and 1). - */ - score: number; + /** + * The label identified by the model. It is one of the suggested `candidate_label`. + */ + label: string; + /** + * The score attributed by the model for that label (between 0 and 1). + */ + score: number; }; /** * Parameters specific to zero-shot image classification pipelines. */ export type ZeroShotImageClassificationPipelineOptions = { - /** - * The sentence used in conjunction with `candidate_labels` - * to attempt the image classification by replacing the placeholder with the candidate_labels. - * Then likelihood is estimated by using `logits_per_image`. - */ - hypothesis_template?: string; + /** + * The sentence used in conjunction with `candidate_labels` + * to attempt the image classification by replacing the placeholder with the candidate_labels. + * Then likelihood is estimated by using `logits_per_image`. + */ + hypothesis_template?: string; }; /** * Assign labels to the image(s) passed as inputs. 
*/ -export type ZeroShotImageClassificationPipelineCallback = (images: ImagePipelineInputs, candidate_labels: string[], options?: ZeroShotImageClassificationPipelineOptions) => Promise; -export type ZeroShotImageClassificationPipelineType = TextImagePipelineConstructorArgs & ZeroShotImageClassificationPipelineCallback & Disposable; +export type ZeroShotImageClassificationPipelineCallback = ( + images: ImagePipelineInputs, + candidate_labels: string[], + options?: ZeroShotImageClassificationPipelineOptions, +) => Promise< + ZeroShotImageClassificationOutput[] | ZeroShotImageClassificationOutput[][] +>; +export type ZeroShotImageClassificationPipelineType = + TextImagePipelineConstructorArgs & + ZeroShotImageClassificationPipelineCallback & + Disposable; export type ObjectDetectionPipelineSingle = { - /** - * The class label identified by the model. - */ - label: string; - /** - * The score attributed by the model for that label. - */ - score: number; - /** - * The bounding box of detected object in image's original size, or as a percentage if `percentage` is set to true. - */ - box: BoundingBox; + /** + * The class label identified by the model. + */ + label: string; + /** + * The score attributed by the model for that label. + */ + score: number; + /** + * The bounding box of detected object in image's original size, or as a percentage if `percentage` is set to true. + */ + box: BoundingBox; }; export type ObjectDetectionPipelineOutput = ObjectDetectionPipelineSingle[]; /** * Parameters specific to object detection pipelines. */ export type ObjectDetectionPipelineOptions = { - /** - * The threshold used to filter boxes by score. - */ - threshold?: number; - /** - * Whether to return the boxes coordinates in percentage (true) or in pixels (false). - */ - percentage?: boolean; + /** + * The threshold used to filter boxes by score. + */ + threshold?: number; + /** + * Whether to return the boxes coordinates in percentage (true) or in pixels (false). 
+ */ + percentage?: boolean; }; /** * Detect objects (bounding boxes & classes) in the image(s) passed as inputs. */ -export type ObjectDetectionPipelineCallback = (images: ImagePipelineInputs, options?: ObjectDetectionPipelineOptions) => Promise; -export type ObjectDetectionPipelineType = ImagePipelineConstructorArgs & ObjectDetectionPipelineCallback & Disposable; +export type ObjectDetectionPipelineCallback = ( + images: ImagePipelineInputs, + options?: ObjectDetectionPipelineOptions, +) => Promise; +export type ObjectDetectionPipelineType = ImagePipelineConstructorArgs & + ObjectDetectionPipelineCallback & + Disposable; export type ZeroShotObjectDetectionOutput = { - /** - * Text query corresponding to the found object. - */ - label: string; - /** - * Score corresponding to the object (between 0 and 1). - */ - score: number; - /** - * Bounding box of the detected object in image's original size, or as a percentage if `percentage` is set to true. - */ - box: BoundingBox; + /** + * Text query corresponding to the found object. + */ + label: string; + /** + * Score corresponding to the object (between 0 and 1). + */ + score: number; + /** + * Bounding box of the detected object in image's original size, or as a percentage if `percentage` is set to true. + */ + box: BoundingBox; }; /** * Parameters specific to zero-shot object detection pipelines. */ export type ZeroShotObjectDetectionPipelineOptions = { - /** - * The probability necessary to make a prediction. - */ - threshold?: number; - /** - * The number of top predictions that will be returned by the pipeline. - * If the provided number is `null` or higher than the number of predictions available, it will default - * to the number of predictions. - */ - topk?: number; - /** - * Whether to return the boxes coordinates in percentage (true) or in pixels (false). - */ - percentage?: boolean; + /** + * The probability necessary to make a prediction. 
+ */ + threshold?: number; + /** + * The number of top predictions that will be returned by the pipeline. + * If the provided number is `null` or higher than the number of predictions available, it will default + * to the number of predictions. + */ + topk?: number; + /** + * Whether to return the boxes coordinates in percentage (true) or in pixels (false). + */ + percentage?: boolean; }; /** * Detect objects (bounding boxes & classes) in the image(s) passed as inputs. */ -export type ZeroShotObjectDetectionPipelineCallback = (images: ImagePipelineInputs, candidate_labels: string[], options?: ZeroShotObjectDetectionPipelineOptions) => Promise; -export type ZeroShotObjectDetectionPipelineType = TextImagePipelineConstructorArgs & ZeroShotObjectDetectionPipelineCallback & Disposable; +export type ZeroShotObjectDetectionPipelineCallback = ( + images: ImagePipelineInputs, + candidate_labels: string[], + options?: ZeroShotObjectDetectionPipelineOptions, +) => Promise< + ZeroShotObjectDetectionOutput[] | ZeroShotObjectDetectionOutput[][] +>; +export type ZeroShotObjectDetectionPipelineType = + TextImagePipelineConstructorArgs & + ZeroShotObjectDetectionPipelineCallback & + Disposable; export type DocumentQuestionAnsweringSingle = { - /** - * The generated text. - */ - answer: string; + /** + * The generated text. + */ + answer: string; }; export type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringSingle[]; /** * Answer the question given as input by using the document. 
*/ -export type DocumentQuestionAnsweringPipelineCallback = (image: ImageInput, question: string, options?: import('./utils/generation.js').GenerationConfigType) => Promise; -export type DocumentQuestionAnsweringPipelineType = TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable; +export type DocumentQuestionAnsweringPipelineCallback = ( + image: ImageInput, + question: string, + options?: import("./utils/generation.js").GenerationConfigType, +) => Promise< + DocumentQuestionAnsweringOutput | DocumentQuestionAnsweringOutput[] +>; +export type DocumentQuestionAnsweringPipelineType = + TextImagePipelineConstructorArgs & + DocumentQuestionAnsweringPipelineCallback & + Disposable; export type VocoderOptions = { - /** - * The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder. - */ - vocoder?: PreTrainedModel; + /** + * The vocoder used by the pipeline (if the model uses one). If not provided, use the default HifiGan vocoder. + */ + vocoder?: PreTrainedModel; }; -export type TextToAudioPipelineConstructorArgs = TextAudioPipelineConstructorArgs & VocoderOptions; +export type TextToAudioPipelineConstructorArgs = + TextAudioPipelineConstructorArgs & VocoderOptions; export type TextToAudioOutput = { - /** - * The generated audio waveform. - */ - audio: Float32Array; - /** - * The sampling rate of the generated audio waveform. - */ - sampling_rate: number; + /** + * The generated audio waveform. + */ + audio: Float32Array; + /** + * The sampling rate of the generated audio waveform. + */ + sampling_rate: number; }; /** * Parameters specific to text-to-audio pipelines. */ export type TextToAudioPipelineOptions = { - /** - * The speaker embeddings (if the model requires it). - */ - speaker_embeddings?: Tensor | Float32Array | string | URL; + /** + * The speaker embeddings (if the model requires it). 
+ */ + speaker_embeddings?: Tensor | Float32Array | string | URL; }; /** * Generates speech/audio from the inputs. */ -export type TextToAudioPipelineCallback = (texts: string | string[], options: TextToAudioPipelineOptions) => Promise; -export type TextToAudioPipelineType = TextToAudioPipelineConstructorArgs & TextToAudioPipelineCallback & Disposable; +export type TextToAudioPipelineCallback = ( + texts: string | string[], + options: TextToAudioPipelineOptions, +) => Promise; +export type TextToAudioPipelineType = TextToAudioPipelineConstructorArgs & + TextToAudioPipelineCallback & + Disposable; /** * Transform the image(s) passed as inputs. */ -export type ImageToImagePipelineCallback = (images: ImagePipelineInputs) => Promise; -export type ImageToImagePipelineType = ImagePipelineConstructorArgs & ImageToImagePipelineCallback & Disposable; +export type ImageToImagePipelineCallback = ( + images: ImagePipelineInputs, +) => Promise; +export type ImageToImagePipelineType = ImagePipelineConstructorArgs & + ImageToImagePipelineCallback & + Disposable; export type DepthEstimationPipelineOutput = { - /** - * The raw depth map predicted by the model. - */ - predicted_depth: Tensor; - /** - * The processed depth map as an image (with the same size as the input image). - */ - depth: RawImage; + /** + * The raw depth map predicted by the model. + */ + predicted_depth: Tensor; + /** + * The processed depth map as an image (with the same size as the input image). + */ + depth: RawImage; }; /** * Predicts the depth for the image(s) passed as inputs. 
*/ -export type DepthEstimationPipelineCallback = (images: ImagePipelineInputs) => Promise; -export type DepthEstimationPipelineType = ImagePipelineConstructorArgs & DepthEstimationPipelineCallback & Disposable; -import { PreTrainedModel } from './models.js'; -import { PreTrainedTokenizer } from './tokenizers.js'; -import { Processor } from './processors.js'; -import { Tensor } from './utils/tensor.js'; -import { RawImage } from './utils/image.js'; +export type DepthEstimationPipelineCallback = ( + images: ImagePipelineInputs, +) => Promise; +export type DepthEstimationPipelineType = ImagePipelineConstructorArgs & + DepthEstimationPipelineCallback & + Disposable; +import { PreTrainedModel } from "./models.js"; +import { PreTrainedTokenizer } from "./tokenizers.js"; +import { Processor } from "./processors.js"; +import { Tensor } from "./utils/tensor.js"; +import { RawImage } from "./utils/image.js"; declare const SUPPORTED_TASKS: Readonly<{ - "text-classification": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof TextClassificationPipeline; - model: typeof AutoModelForSequenceClassification; - default: { - model: string; - }; - type: string; + "text-classification": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof TextClassificationPipeline; + model: typeof AutoModelForSequenceClassification; + default: { + model: string; }; - "token-classification": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof TokenClassificationPipeline; - model: typeof AutoModelForTokenClassification; - default: { - model: string; - }; - type: string; + type: string; + }; + "token-classification": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof TokenClassificationPipeline; + model: typeof AutoModelForTokenClassification; + default: { + model: string; }; - "question-answering": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof QuestionAnsweringPipeline; - model: typeof AutoModelForQuestionAnswering; - default: { - model: string; - }; - type: string; + type: 
string; + }; + "question-answering": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof QuestionAnsweringPipeline; + model: typeof AutoModelForQuestionAnswering; + default: { + model: string; }; - "fill-mask": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof FillMaskPipeline; - model: typeof AutoModelForMaskedLM; - default: { - model: string; - }; - type: string; + type: string; + }; + "fill-mask": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof FillMaskPipeline; + model: typeof AutoModelForMaskedLM; + default: { + model: string; }; - summarization: { - tokenizer: typeof AutoTokenizer; - pipeline: typeof SummarizationPipeline; - model: typeof AutoModelForSeq2SeqLM; - default: { - model: string; - }; - type: string; + type: string; + }; + summarization: { + tokenizer: typeof AutoTokenizer; + pipeline: typeof SummarizationPipeline; + model: typeof AutoModelForSeq2SeqLM; + default: { + model: string; }; - translation: { - tokenizer: typeof AutoTokenizer; - pipeline: typeof TranslationPipeline; - model: typeof AutoModelForSeq2SeqLM; - default: { - model: string; - }; - type: string; + type: string; + }; + translation: { + tokenizer: typeof AutoTokenizer; + pipeline: typeof TranslationPipeline; + model: typeof AutoModelForSeq2SeqLM; + default: { + model: string; }; - "text2text-generation": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof Text2TextGenerationPipeline; - model: typeof AutoModelForSeq2SeqLM; - default: { - model: string; - }; - type: string; + type: string; + }; + "text2text-generation": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof Text2TextGenerationPipeline; + model: typeof AutoModelForSeq2SeqLM; + default: { + model: string; }; - "text-generation": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof TextGenerationPipeline; - model: typeof AutoModelForCausalLM; - default: { - model: string; - }; - type: string; + type: string; + }; + "text-generation": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof 
TextGenerationPipeline; + model: typeof AutoModelForCausalLM; + default: { + model: string; }; - "zero-shot-classification": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof ZeroShotClassificationPipeline; - model: typeof AutoModelForSequenceClassification; - default: { - model: string; - }; - type: string; + type: string; + }; + "zero-shot-classification": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof ZeroShotClassificationPipeline; + model: typeof AutoModelForSequenceClassification; + default: { + model: string; }; - "audio-classification": { - pipeline: typeof AudioClassificationPipeline; - model: typeof AutoModelForAudioClassification; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "audio-classification": { + pipeline: typeof AudioClassificationPipeline; + model: typeof AutoModelForAudioClassification; + processor: typeof AutoProcessor; + default: { + model: string; }; - "zero-shot-audio-classification": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof ZeroShotAudioClassificationPipeline; - model: typeof AutoModel; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "zero-shot-audio-classification": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof ZeroShotAudioClassificationPipeline; + model: typeof AutoModel; + processor: typeof AutoProcessor; + default: { + model: string; }; - "automatic-speech-recognition": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof AutomaticSpeechRecognitionPipeline; - model: (typeof AutoModelForSpeechSeq2Seq)[]; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "automatic-speech-recognition": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof AutomaticSpeechRecognitionPipeline; + model: (typeof AutoModelForSpeechSeq2Seq)[]; + processor: typeof AutoProcessor; + default: { + model: string; }; - "text-to-audio": { - 
tokenizer: typeof AutoTokenizer; - pipeline: typeof TextToAudioPipeline; - model: (typeof AutoModelForTextToSpectrogram | typeof AutoModelForTextToWaveform)[]; - processor: (typeof AutoProcessor)[]; - default: { - model: string; - }; - type: string; + type: string; + }; + "text-to-audio": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof TextToAudioPipeline; + model: ( + | typeof AutoModelForTextToSpectrogram + | typeof AutoModelForTextToWaveform + )[]; + processor: (typeof AutoProcessor)[]; + default: { + model: string; }; - "image-to-text": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof ImageToTextPipeline; - model: typeof AutoModelForVision2Seq; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "image-to-text": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof ImageToTextPipeline; + model: typeof AutoModelForVision2Seq; + processor: typeof AutoProcessor; + default: { + model: string; }; - "image-classification": { - pipeline: typeof ImageClassificationPipeline; - model: typeof AutoModelForImageClassification; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "image-classification": { + pipeline: typeof ImageClassificationPipeline; + model: typeof AutoModelForImageClassification; + processor: typeof AutoProcessor; + default: { + model: string; }; - "image-segmentation": { - pipeline: typeof ImageSegmentationPipeline; - model: (typeof AutoModelForImageSegmentation)[]; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "image-segmentation": { + pipeline: typeof ImageSegmentationPipeline; + model: (typeof AutoModelForImageSegmentation)[]; + processor: typeof AutoProcessor; + default: { + model: string; }; - "zero-shot-image-classification": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof ZeroShotImageClassificationPipeline; - model: typeof AutoModel; - processor: 
typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "zero-shot-image-classification": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof ZeroShotImageClassificationPipeline; + model: typeof AutoModel; + processor: typeof AutoProcessor; + default: { + model: string; }; - "object-detection": { - pipeline: typeof ObjectDetectionPipeline; - model: typeof AutoModelForObjectDetection; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "object-detection": { + pipeline: typeof ObjectDetectionPipeline; + model: typeof AutoModelForObjectDetection; + processor: typeof AutoProcessor; + default: { + model: string; }; - "zero-shot-object-detection": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof ZeroShotObjectDetectionPipeline; - model: typeof AutoModelForZeroShotObjectDetection; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "zero-shot-object-detection": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof ZeroShotObjectDetectionPipeline; + model: typeof AutoModelForZeroShotObjectDetection; + processor: typeof AutoProcessor; + default: { + model: string; }; - "document-question-answering": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof DocumentQuestionAnsweringPipeline; - model: typeof AutoModelForDocumentQuestionAnswering; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "document-question-answering": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof DocumentQuestionAnsweringPipeline; + model: typeof AutoModelForDocumentQuestionAnswering; + processor: typeof AutoProcessor; + default: { + model: string; }; - "image-to-image": { - pipeline: typeof ImageToImagePipeline; - model: typeof AutoModelForImageToImage; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + 
"image-to-image": { + pipeline: typeof ImageToImagePipeline; + model: typeof AutoModelForImageToImage; + processor: typeof AutoProcessor; + default: { + model: string; }; - "depth-estimation": { - pipeline: typeof DepthEstimationPipeline; - model: typeof AutoModelForDepthEstimation; - processor: typeof AutoProcessor; - default: { - model: string; - }; - type: string; + type: string; + }; + "depth-estimation": { + pipeline: typeof DepthEstimationPipeline; + model: typeof AutoModelForDepthEstimation; + processor: typeof AutoProcessor; + default: { + model: string; }; - "feature-extraction": { - tokenizer: typeof AutoTokenizer; - pipeline: typeof FeatureExtractionPipeline; - model: typeof AutoModel; - default: { - model: string; - }; - type: string; + type: string; + }; + "feature-extraction": { + tokenizer: typeof AutoTokenizer; + pipeline: typeof FeatureExtractionPipeline; + model: typeof AutoModel; + default: { + model: string; }; + type: string; + }; }>; declare const TASK_ALIASES: Readonly<{ - "sentiment-analysis": "text-classification"; - ner: "token-classification"; - asr: "automatic-speech-recognition"; - "text-to-speech": "text-to-audio"; - embeddings: "feature-extraction"; + "sentiment-analysis": "text-classification"; + ner: "token-classification"; + asr: "automatic-speech-recognition"; + "text-to-speech": "text-to-audio"; + embeddings: "feature-extraction"; }>; -import { AutoTokenizer } from './tokenizers.js'; -import { AutoModelForSequenceClassification } from './models.js'; -import { AutoModelForTokenClassification } from './models.js'; -import { AutoModelForQuestionAnswering } from './models.js'; -import { AutoModelForMaskedLM } from './models.js'; -import { AutoModelForSeq2SeqLM } from './models.js'; -import { AutoModelForCausalLM } from './models.js'; -import { AutoModelForAudioClassification } from './models.js'; -import { AutoProcessor } from './processors.js'; -import { AutoModel } from './models.js'; -import { AutoModelForSpeechSeq2Seq } from 
'./models.js'; -import { AutoModelForTextToSpectrogram } from './models.js'; -import { AutoModelForTextToWaveform } from './models.js'; -import { AutoModelForVision2Seq } from './models.js'; -import { AutoModelForImageClassification } from './models.js'; -import { AutoModelForImageSegmentation } from './models.js'; -import { AutoModelForObjectDetection } from './models.js'; -import { AutoModelForZeroShotObjectDetection } from './models.js'; -import { AutoModelForDocumentQuestionAnswering } from './models.js'; -import { AutoModelForImageToImage } from './models.js'; -import { AutoModelForDepthEstimation } from './models.js'; +import { AutoTokenizer } from "./tokenizers.js"; +import { AutoModelForSequenceClassification } from "./models.js"; +import { AutoModelForTokenClassification } from "./models.js"; +import { AutoModelForQuestionAnswering } from "./models.js"; +import { AutoModelForMaskedLM } from "./models.js"; +import { AutoModelForSeq2SeqLM } from "./models.js"; +import { AutoModelForCausalLM } from "./models.js"; +import { AutoModelForAudioClassification } from "./models.js"; +import { AutoProcessor } from "./processors.js"; +import { AutoModel } from "./models.js"; +import { AutoModelForSpeechSeq2Seq } from "./models.js"; +import { AutoModelForTextToSpectrogram } from "./models.js"; +import { AutoModelForTextToWaveform } from "./models.js"; +import { AutoModelForVision2Seq } from "./models.js"; +import { AutoModelForImageClassification } from "./models.js"; +import { AutoModelForImageSegmentation } from "./models.js"; +import { AutoModelForObjectDetection } from "./models.js"; +import { AutoModelForZeroShotObjectDetection } from "./models.js"; +import { AutoModelForDocumentQuestionAnswering } from "./models.js"; +import { AutoModelForImageToImage } from "./models.js"; +import { AutoModelForDepthEstimation } from "./models.js"; export {}; -//# sourceMappingURL=pipelines.d.ts.map \ No newline at end of file +//# sourceMappingURL=pipelines.d.ts.map diff --git 
a/core/vendor/modules/@xenova/transformers/types/processors.d.ts b/core/vendor/modules/@xenova/transformers/types/processors.d.ts index b604b1244..32d66aee8 100644 --- a/core/vendor/modules/@xenova/transformers/types/processors.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/processors.d.ts @@ -1,6 +1,6 @@ declare const FeatureExtractor_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * Base class for feature extractors. @@ -8,13 +8,13 @@ declare const FeatureExtractor_base: new () => { * @extends Callable */ export class FeatureExtractor extends FeatureExtractor_base { - /** - * Constructs a new FeatureExtractor instance. - * - * @param {Object} config The configuration for the feature extractor. - */ - constructor(config: any); - config: any; + /** + * Constructs a new FeatureExtractor instance. + * + * @param {Object} config The configuration for the feature extractor. + */ + constructor(config: any); + config: any; } /** * @typedef {object} ImageFeatureExtractorResult @@ -28,207 +28,228 @@ export class FeatureExtractor extends FeatureExtractor_base { * @extends FeatureExtractor */ export class ImageFeatureExtractor extends FeatureExtractor { + /** + * Constructs a new ImageFeatureExtractor instance. + * + * @param {Object} config The configuration for the feature extractor. + * @param {number[]} config.image_mean The mean values for image normalization. + * @param {number[]} config.image_std The standard deviation values for image normalization. + * @param {boolean} config.do_rescale Whether to rescale the image pixel values to the [0,1] range. + * @param {number} config.rescale_factor The factor to use for rescaling the image pixel values. + * @param {boolean} config.do_normalize Whether to normalize the image pixel values. + * @param {boolean} config.do_resize Whether to resize the image. + * @param {number} config.resample What method to use for resampling. 
+ * @param {number} config.size The size to resize the image to. + */ + constructor(config: { + image_mean: number[]; + image_std: number[]; + do_rescale: boolean; + rescale_factor: number; + do_normalize: boolean; + do_resize: boolean; + resample: number; + size: number; + }); + image_mean: any; + image_std: any; + resample: any; + do_rescale: any; + rescale_factor: any; + do_normalize: any; + do_resize: any; + do_thumbnail: any; + size: any; + size_divisibility: any; + do_center_crop: any; + crop_size: any; + do_convert_rgb: any; + do_crop_margin: any; + pad_size: any; + do_pad: any; + /** + * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any + * corresponding dimension of the specified size. + * @param {RawImage} image The image to be resized. + * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to. + * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use. + * @returns {Promise} The resized image. + */ + thumbnail( + image: RawImage, + size: { + height: number; + width: number; + }, + resample?: string | 0 | 1 | 2 | 3 | 4 | 5, + ): Promise; + /** + * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold). + * @param {RawImage} image The image to be cropped. + * @param {number} gray_threshold Value below which pixels are considered to be gray. + * @returns {Promise} The cropped image. + */ + crop_margin(image: RawImage, gray_threshold?: number): Promise; + /** + * Pad the image by a certain amount. + * @param {Float32Array} pixelData The pixel data to pad. + * @param {number[]} imgDims The dimensions of the image. + * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. + * @param {Object} options The options for padding. + * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. 
+ * @param {boolean} [options.center=false] Whether to center the image. + * @param {number} [options.constant_values=0] The constant value to use for padding. + * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions. + */ + pad_image( + pixelData: Float32Array, + imgDims: number[], + padSize: + | { + width: number; + height: number; + } + | number, + { + mode, + center, + constant_values, + }?: { + mode?: "constant" | "symmetric"; + center?: boolean; + constant_values?: number; + }, + ): [Float32Array, number[]]; + /** + * Rescale the image' pixel values by `this.rescale_factor`. + * @param {Float32Array} pixelData The pixel data to rescale. + * @returns {void} + */ + rescale(pixelData: Float32Array): void; + /** + * Find the target (width, height) dimension of the output image after + * resizing given the input image and the desired size. + * @param {RawImage} image The image to resize. + * @param {any} size The size to use for resizing the image. + * @returns {[number, number]} The target (width, height) dimension of the output image after resizing. + */ + get_resize_output_image_size(image: RawImage, size: any): [number, number]; + /** + * Resizes the image. + * @param {RawImage} image The image to resize. + * @returns {Promise} The resized image. + */ + resize(image: RawImage): Promise; + /** + * @typedef {object} PreprocessedImage + * @property {HeightWidth} original_size The original size of the image. + * @property {HeightWidth} reshaped_input_size The reshaped input size of the image. + * @property {Tensor} pixel_values The pixel values of the preprocessed image. + */ + /** + * Preprocesses the given image. + * + * @param {RawImage} image The image to preprocess. + * @param {Object} overrides The overrides for the preprocessing options. + * @returns {Promise} The preprocessed image. 
+ */ + preprocess( + image: RawImage, + { do_normalize, do_pad, do_convert_rgb, do_convert_grayscale }?: any, + ): Promise<{ /** - * Constructs a new ImageFeatureExtractor instance. - * - * @param {Object} config The configuration for the feature extractor. - * @param {number[]} config.image_mean The mean values for image normalization. - * @param {number[]} config.image_std The standard deviation values for image normalization. - * @param {boolean} config.do_rescale Whether to rescale the image pixel values to the [0,1] range. - * @param {number} config.rescale_factor The factor to use for rescaling the image pixel values. - * @param {boolean} config.do_normalize Whether to normalize the image pixel values. - * @param {boolean} config.do_resize Whether to resize the image. - * @param {number} config.resample What method to use for resampling. - * @param {number} config.size The size to resize the image to. + * The original size of the image. */ - constructor(config: { - image_mean: number[]; - image_std: number[]; - do_rescale: boolean; - rescale_factor: number; - do_normalize: boolean; - do_resize: boolean; - resample: number; - size: number; - }); - image_mean: any; - image_std: any; - resample: any; - do_rescale: any; - rescale_factor: any; - do_normalize: any; - do_resize: any; - do_thumbnail: any; - size: any; - size_divisibility: any; - do_center_crop: any; - crop_size: any; - do_convert_rgb: any; - do_crop_margin: any; - pad_size: any; - do_pad: any; + original_size: HeightWidth; /** - * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any - * corresponding dimension of the specified size. - * @param {RawImage} image The image to be resized. - * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to. - * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use. - * @returns {Promise} The resized image. 
+ * The reshaped input size of the image. */ - thumbnail(image: RawImage, size: { - height: number; - width: number; - }, resample?: string | 0 | 1 | 2 | 3 | 4 | 5): Promise; + reshaped_input_size: HeightWidth; /** - * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold). - * @param {RawImage} image The image to be cropped. - * @param {number} gray_threshold Value below which pixels are considered to be gray. - * @returns {Promise} The cropped image. + * The pixel values of the preprocessed image. */ - crop_margin(image: RawImage, gray_threshold?: number): Promise; - /** - * Pad the image by a certain amount. - * @param {Float32Array} pixelData The pixel data to pad. - * @param {number[]} imgDims The dimensions of the image. - * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. - * @param {Object} options The options for padding. - * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. - * @param {boolean} [options.center=false] Whether to center the image. - * @param {number} [options.constant_values=0] The constant value to use for padding. - * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions. - */ - pad_image(pixelData: Float32Array, imgDims: number[], padSize: { - width: number; - height: number; - } | number, { mode, center, constant_values, }?: { - mode?: 'constant' | 'symmetric'; - center?: boolean; - constant_values?: number; - }): [Float32Array, number[]]; - /** - * Rescale the image' pixel values by `this.rescale_factor`. - * @param {Float32Array} pixelData The pixel data to rescale. - * @returns {void} - */ - rescale(pixelData: Float32Array): void; - /** - * Find the target (width, height) dimension of the output image after - * resizing given the input image and the desired size. - * @param {RawImage} image The image to resize. - * @param {any} size The size to use for resizing the image. 
- * @returns {[number, number]} The target (width, height) dimension of the output image after resizing. - */ - get_resize_output_image_size(image: RawImage, size: any): [number, number]; - /** - * Resizes the image. - * @param {RawImage} image The image to resize. - * @returns {Promise} The resized image. - */ - resize(image: RawImage): Promise; - /** - * @typedef {object} PreprocessedImage - * @property {HeightWidth} original_size The original size of the image. - * @property {HeightWidth} reshaped_input_size The reshaped input size of the image. - * @property {Tensor} pixel_values The pixel values of the preprocessed image. - */ - /** - * Preprocesses the given image. - * - * @param {RawImage} image The image to preprocess. - * @param {Object} overrides The overrides for the preprocessing options. - * @returns {Promise} The preprocessed image. - */ - preprocess(image: RawImage, { do_normalize, do_pad, do_convert_rgb, do_convert_grayscale, }?: any): Promise<{ - /** - * The original size of the image. - */ - original_size: HeightWidth; - /** - * The reshaped input size of the image. - */ - reshaped_input_size: HeightWidth; - /** - * The pixel values of the preprocessed image. - */ - pixel_values: Tensor; - }>; - /** - * Calls the feature extraction process on an array of images, - * preprocesses each image, and concatenates the resulting - * features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @param {...any} args Additional arguments. - * @returns {Promise} An object containing the concatenated pixel values (and other metadata) of the preprocessed images. - */ - _call(images: RawImage[], ...args: any[]): Promise; + pixel_values: Tensor; + }>; + /** + * Calls the feature extraction process on an array of images, + * preprocesses each image, and concatenates the resulting + * features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. 
+ * @param {...any} args Additional arguments. + * @returns {Promise} An object containing the concatenated pixel values (and other metadata) of the preprocessed images. + */ + _call( + images: RawImage[], + ...args: any[] + ): Promise; } export class SegformerFeatureExtractor extends ImageFeatureExtractor { - /** - * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps. - * @param {*} outputs Raw outputs of the model. - * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size - * (height, width) of each prediction. If unset, predictions will not be resized. - * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps. - */ - post_process_semantic_segmentation(outputs: any, target_sizes?: number[][]): { - segmentation: Tensor; - labels: number[]; - }[]; -} -export class BitImageProcessor extends ImageFeatureExtractor { -} -export class DPTFeatureExtractor extends ImageFeatureExtractor { -} -export class GLPNFeatureExtractor extends ImageFeatureExtractor { -} -export class CLIPFeatureExtractor extends ImageFeatureExtractor { -} -export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor { -} -export class SiglipImageProcessor extends ImageFeatureExtractor { + /** + * Converts the output of `SegformerForSemanticSegmentation` into semantic segmentation maps. + * @param {*} outputs Raw outputs of the model. + * @param {number[][]} [target_sizes=null] List of tuples corresponding to the requested final size + * (height, width) of each prediction. If unset, predictions will not be resized. + * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps. 
+ */ + post_process_semantic_segmentation( + outputs: any, + target_sizes?: number[][], + ): { + segmentation: Tensor; + labels: number[]; + }[]; } +export class BitImageProcessor extends ImageFeatureExtractor {} +export class DPTFeatureExtractor extends ImageFeatureExtractor {} +export class GLPNFeatureExtractor extends ImageFeatureExtractor {} +export class CLIPFeatureExtractor extends ImageFeatureExtractor {} +export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor {} +export class SiglipImageProcessor extends ImageFeatureExtractor {} export class ConvNextFeatureExtractor extends ImageFeatureExtractor { - constructor(config: any); - /** - * Percentage of the image to crop. Only has an effect if this.size < 384. - */ - crop_pct: any; - resize(image: any): Promise; -} -export class ConvNextImageProcessor extends ConvNextFeatureExtractor { -} -export class ViTFeatureExtractor extends ImageFeatureExtractor { -} -export class ViTImageProcessor extends ImageFeatureExtractor { -} -export class MobileViTFeatureExtractor extends ImageFeatureExtractor { + constructor(config: any); + /** + * Percentage of the image to crop. Only has an effect if this.size < 384. + */ + crop_pct: any; + resize(image: any): Promise; } +export class ConvNextImageProcessor extends ConvNextFeatureExtractor {} +export class ViTFeatureExtractor extends ImageFeatureExtractor {} +export class ViTImageProcessor extends ImageFeatureExtractor {} +export class MobileViTFeatureExtractor extends ImageFeatureExtractor {} export class OwlViTFeatureExtractor extends ImageFeatureExtractor { - /** - * Post-processes the outputs of the model (for object detection). - * @param {Object} outputs The outputs of the model that must be post-processed - * @param {Tensor} outputs.logits The logits - * @param {Tensor} outputs.pred_boxes The predicted boxes. - * @param {number} [threshold=0.5] The threshold to use for the scores. 
- * @param {number[][]} [target_sizes=null] The sizes of the original images. - * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. - * @return {Object[]} An array of objects containing the post-processed outputs. - * @private - */ - post_process_object_detection(outputs: { - logits: Tensor; - pred_boxes: Tensor; - }, threshold?: number, target_sizes?: number[][], is_zero_shot?: boolean): any[]; -} -export class DeiTFeatureExtractor extends ImageFeatureExtractor { -} -export class BeitFeatureExtractor extends ImageFeatureExtractor { + /** + * Post-processes the outputs of the model (for object detection). + * @param {Object} outputs The outputs of the model that must be post-processed + * @param {Tensor} outputs.logits The logits + * @param {Tensor} outputs.pred_boxes The predicted boxes. + * @param {number} [threshold=0.5] The threshold to use for the scores. + * @param {number[][]} [target_sizes=null] The sizes of the original images. + * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. + * @return {Object[]} An array of objects containing the post-processed outputs. 
+ * @private + */ + post_process_object_detection( + outputs: { + logits: Tensor; + pred_boxes: Tensor; + }, + threshold?: number, + target_sizes?: number[][], + is_zero_shot?: boolean, + ): any[]; } +export class DeiTFeatureExtractor extends ImageFeatureExtractor {} +export class BeitFeatureExtractor extends ImageFeatureExtractor {} export class DonutFeatureExtractor extends ImageFeatureExtractor { - pad_image(pixelData: any, imgDims: any, padSize: any, options?: {}): [Float32Array, number[]]; -} -export class NougatImageProcessor extends DonutFeatureExtractor { + pad_image( + pixelData: any, + imgDims: any, + padSize: any, + options?: {}, + ): [Float32Array, number[]]; } +export class NougatImageProcessor extends DonutFeatureExtractor {} /** * @typedef {object} DetrFeatureExtractorResultProps * @property {Tensor} pixel_mask @@ -240,99 +261,138 @@ export class NougatImageProcessor extends DonutFeatureExtractor { * @extends ImageFeatureExtractor */ export class DetrFeatureExtractor extends ImageFeatureExtractor { - /** - * Calls the feature extraction process on an array of images, preprocesses - * each image, and concatenates the resulting features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. - */ - _call(images: RawImage[]): Promise; - /** - * Post-processes the outputs of the model (for object detection). - * @param {Object} outputs The outputs of the model that must be post-processed - * @param {Tensor} outputs.logits The logits - * @param {Tensor} outputs.pred_boxes The predicted boxes. - * @param {number} [threshold=0.5] The threshold to use for the scores. - * @param {number[][]} [target_sizes=null] The sizes of the original images. - * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. - * @return {Object[]} An array of objects containing the post-processed outputs. 
- * @private - */ - post_process_object_detection(outputs: { - logits: Tensor; - pred_boxes: Tensor; - }, threshold?: number, target_sizes?: number[][], is_zero_shot?: boolean): any[]; - /** - * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`. - * @param {Tensor} class_logits The class logits. - * @param {Tensor} mask_logits The mask logits. - * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks. - * @param {number} num_labels The number of labels. - * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels. - */ - remove_low_and_no_objects(class_logits: Tensor, mask_logits: Tensor, object_mask_threshold: number, num_labels: number): [Tensor[], number[], number[]]; - /** - * Checks whether the segment is valid or not. - * @param {Int32Array} mask_labels Labels for each pixel in the mask. - * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks. - * @param {number} k The class id of the segment. - * @param {number} mask_threshold The mask threshold. - * @param {number} overlap_mask_area_threshold The overlap mask area threshold. - * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels. - */ - check_segment_validity(mask_labels: Int32Array, mask_probs: Tensor[], k: number, mask_threshold?: number, overlap_mask_area_threshold?: number): [boolean, number[]]; - /** - * Computes the segments. - * @param {Tensor[]} mask_probs The mask probabilities. - * @param {number[]} pred_scores The predicted scores. - * @param {number[]} pred_labels The predicted labels. - * @param {number} mask_threshold The mask threshold. - * @param {number} overlap_mask_area_threshold The overlap mask area threshold. - * @param {Set} label_ids_to_fuse The label ids to fuse. - * @param {number[]} target_size The target size of the image. 
- * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments. - */ - compute_segments(mask_probs: Tensor[], pred_scores: number[], pred_labels: number[], mask_threshold: number, overlap_mask_area_threshold: number, label_ids_to_fuse?: Set, target_size?: number[]): [Tensor, Array<{ - id: number; - label_id: number; - score: number; - }>]; - /** - * Post-process the model output to generate the final panoptic segmentation. - * @param {*} outputs The model output to post process - * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks. - * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values. - * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask. - * @param {Set} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together. - * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to. - * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>} - */ - post_process_panoptic_segmentation(outputs: any, threshold?: number, mask_threshold?: number, overlap_mask_area_threshold?: number, label_ids_to_fuse?: Set, target_sizes?: number[][]): Array<{ - segmentation: Tensor; - segments_info: Array<{ - id: number; - label_id: number; - score: number; - }>; + /** + * Calls the feature extraction process on an array of images, preprocesses + * each image, and concatenates the resulting features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. + */ + _call(images: RawImage[]): Promise; + /** + * Post-processes the outputs of the model (for object detection). 
+ * @param {Object} outputs The outputs of the model that must be post-processed + * @param {Tensor} outputs.logits The logits + * @param {Tensor} outputs.pred_boxes The predicted boxes. + * @param {number} [threshold=0.5] The threshold to use for the scores. + * @param {number[][]} [target_sizes=null] The sizes of the original images. + * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. + * @return {Object[]} An array of objects containing the post-processed outputs. + * @private + */ + post_process_object_detection( + outputs: { + logits: Tensor; + pred_boxes: Tensor; + }, + threshold?: number, + target_sizes?: number[][], + is_zero_shot?: boolean, + ): any[]; + /** + * Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels`. + * @param {Tensor} class_logits The class logits. + * @param {Tensor} mask_logits The mask logits. + * @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks. + * @param {number} num_labels The number of labels. + * @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels. + */ + remove_low_and_no_objects( + class_logits: Tensor, + mask_logits: Tensor, + object_mask_threshold: number, + num_labels: number, + ): [Tensor[], number[], number[]]; + /** + * Checks whether the segment is valid or not. + * @param {Int32Array} mask_labels Labels for each pixel in the mask. + * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks. + * @param {number} k The class id of the segment. + * @param {number} mask_threshold The mask threshold. + * @param {number} overlap_mask_area_threshold The overlap mask area threshold. + * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels. 
+ */ + check_segment_validity( + mask_labels: Int32Array, + mask_probs: Tensor[], + k: number, + mask_threshold?: number, + overlap_mask_area_threshold?: number, + ): [boolean, number[]]; + /** + * Computes the segments. + * @param {Tensor[]} mask_probs The mask probabilities. + * @param {number[]} pred_scores The predicted scores. + * @param {number[]} pred_labels The predicted labels. + * @param {number} mask_threshold The mask threshold. + * @param {number} overlap_mask_area_threshold The overlap mask area threshold. + * @param {Set} label_ids_to_fuse The label ids to fuse. + * @param {number[]} target_size The target size of the image. + * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments. + */ + compute_segments( + mask_probs: Tensor[], + pred_scores: number[], + pred_labels: number[], + mask_threshold: number, + overlap_mask_area_threshold: number, + label_ids_to_fuse?: Set, + target_size?: number[], + ): [ + Tensor, + Array<{ + id: number; + label_id: number; + score: number; + }>, + ]; + /** + * Post-process the model output to generate the final panoptic segmentation. + * @param {*} outputs The model output to post process + * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks. + * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values. + * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask. + * @param {Set} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together. + * @param {number[][]} [target_sizes=null] The target sizes to resize the masks to. 
+ * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>} + */ + post_process_panoptic_segmentation( + outputs: any, + threshold?: number, + mask_threshold?: number, + overlap_mask_area_threshold?: number, + label_ids_to_fuse?: Set, + target_sizes?: number[][], + ): Array<{ + segmentation: Tensor; + segments_info: Array<{ + id: number; + label_id: number; + score: number; }>; - post_process_instance_segmentation(): void; + }>; + post_process_instance_segmentation(): void; } export class YolosFeatureExtractor extends ImageFeatureExtractor { - /** - * Post-processes the outputs of the model (for object detection). - * @param {Object} outputs The outputs of the model that must be post-processed - * @param {Tensor} outputs.logits The logits - * @param {Tensor} outputs.pred_boxes The predicted boxes. - * @param {number} [threshold=0.5] The threshold to use for the scores. - * @param {number[][]} [target_sizes=null] The sizes of the original images. - * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. - * @return {Object[]} An array of objects containing the post-processed outputs. - * @private - */ - post_process_object_detection(outputs: { - logits: Tensor; - pred_boxes: Tensor; - }, threshold?: number, target_sizes?: number[][], is_zero_shot?: boolean): any[]; + /** + * Post-processes the outputs of the model (for object detection). + * @param {Object} outputs The outputs of the model that must be post-processed + * @param {Tensor} outputs.logits The logits + * @param {Tensor} outputs.pred_boxes The predicted boxes. + * @param {number} [threshold=0.5] The threshold to use for the scores. + * @param {number[][]} [target_sizes=null] The sizes of the original images. + * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed. + * @return {Object[]} An array of objects containing the post-processed outputs. 
+ * @private + */ + post_process_object_detection( + outputs: { + logits: Tensor; + pred_boxes: Tensor; + }, + threshold?: number, + target_sizes?: number[][], + is_zero_shot?: boolean, + ): any[]; } /** * @typedef {object} SamImageProcessorResult @@ -343,258 +403,298 @@ export class YolosFeatureExtractor extends ImageFeatureExtractor { * @property {Tensor} [input_labels] */ export class SamImageProcessor extends ImageFeatureExtractor { - /** - * - * @param {any} input_points - * @param {HeightWidth[]} original_sizes - * @param {HeightWidth[]} reshaped_input_sizes - * @returns {Tensor} - */ - reshape_input_points(input_points: any, original_sizes: HeightWidth[], reshaped_input_sizes: HeightWidth[]): Tensor; - /** - * - * @param {any} input_labels - * @param {Tensor} input_points - * @returns {Tensor} - */ - add_input_labels(input_labels: any, input_points: Tensor): Tensor; - /** - * @param {any[]} images The URL(s) of the image(s) to extract features from. - * @param {any} [input_points] A 3D or 4D array, representing the input points provided by the user. - * - 3D: `[point_batch_size, nb_points_per_image, 2]`. In this case, `batch_size` is assumed to be 1. - * - 4D: `[batch_size, point_batch_size, nb_points_per_image, 2]`. - * @param {any} [input_labels] A 2D or 3D array, representing the input labels for the points, used by the prompt encoder to encode the prompt. - * - 2D: `[point_batch_size, nb_points_per_image]`. In this case, `batch_size` is assumed to be 1. - * - 3D: `[batch_size, point_batch_size, nb_points_per_image]`. - * @returns {Promise} - */ - _call(images: any[], input_points?: any, input_labels?: any): Promise; - /** - * Remove padding and upscale masks to the original image size. - * @param {Tensor} masks Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format. - * @param {number[][]} original_sizes The original sizes of each image before it was resized to the model's expected input shape, in (height, width) format. 
- * @param {number[][]} reshaped_input_sizes The size of each image as it is fed to the model, in (height, width) format. Used to remove padding. - * @param {Object} options Optional parameters for post-processing. - * @param {number} [options.mask_threshold] The threshold to use for binarizing the masks. - * @param {boolean} [options.binarize] Whether to binarize the masks. - * @param {Object} [options.pad_size] The target size the images were padded to before being passed to the model. If `null`, the target size is assumed to be the processor's `pad_size`. - * @param {number} [options.pad_size.height] The height the images were padded to. - * @param {number} [options.pad_size.width] The width the images were padded to. - * @returns {Tensor[]} Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size. - */ - post_process_masks(masks: Tensor, original_sizes: number[][], reshaped_input_sizes: number[][], { mask_threshold, binarize, pad_size, }?: { - mask_threshold?: number; - binarize?: boolean; - pad_size?: { - height?: number; - width?: number; - }; - }): Tensor[]; + /** + * + * @param {any} input_points + * @param {HeightWidth[]} original_sizes + * @param {HeightWidth[]} reshaped_input_sizes + * @returns {Tensor} + */ + reshape_input_points( + input_points: any, + original_sizes: HeightWidth[], + reshaped_input_sizes: HeightWidth[], + ): Tensor; + /** + * + * @param {any} input_labels + * @param {Tensor} input_points + * @returns {Tensor} + */ + add_input_labels(input_labels: any, input_points: Tensor): Tensor; + /** + * @param {any[]} images The URL(s) of the image(s) to extract features from. + * @param {any} [input_points] A 3D or 4D array, representing the input points provided by the user. + * - 3D: `[point_batch_size, nb_points_per_image, 2]`. In this case, `batch_size` is assumed to be 1. + * - 4D: `[batch_size, point_batch_size, nb_points_per_image, 2]`. 
+ * @param {any} [input_labels] A 2D or 3D array, representing the input labels for the points, used by the prompt encoder to encode the prompt. + * - 2D: `[point_batch_size, nb_points_per_image]`. In this case, `batch_size` is assumed to be 1. + * - 3D: `[batch_size, point_batch_size, nb_points_per_image]`. + * @returns {Promise} + */ + _call( + images: any[], + input_points?: any, + input_labels?: any, + ): Promise; + /** + * Remove padding and upscale masks to the original image size. + * @param {Tensor} masks Batched masks from the mask_decoder in (batch_size, num_channels, height, width) format. + * @param {number[][]} original_sizes The original sizes of each image before it was resized to the model's expected input shape, in (height, width) format. + * @param {number[][]} reshaped_input_sizes The size of each image as it is fed to the model, in (height, width) format. Used to remove padding. + * @param {Object} options Optional parameters for post-processing. + * @param {number} [options.mask_threshold] The threshold to use for binarizing the masks. + * @param {boolean} [options.binarize] Whether to binarize the masks. + * @param {Object} [options.pad_size] The target size the images were padded to before being passed to the model. If `null`, the target size is assumed to be the processor's `pad_size`. + * @param {number} [options.pad_size.height] The height the images were padded to. + * @param {number} [options.pad_size.width] The width the images were padded to. + * @returns {Tensor[]} Batched masks in batch_size, num_channels, height, width) format, where (height, width) is given by original_size. 
+ */ + post_process_masks( + masks: Tensor, + original_sizes: number[][], + reshaped_input_sizes: number[][], + { + mask_threshold, + binarize, + pad_size, + }?: { + mask_threshold?: number; + binarize?: boolean; + pad_size?: { + height?: number; + width?: number; + }; + }, + ): Tensor[]; } export class Swin2SRImageProcessor extends ImageFeatureExtractor { - pad_image(pixelData: any, imgDims: any, padSize: any, options?: {}): [Float32Array, number[]]; + pad_image( + pixelData: any, + imgDims: any, + padSize: any, + options?: {}, + ): [Float32Array, number[]]; } export class VitMatteImageProcessor extends ImageFeatureExtractor { - /** - * Calls the feature extraction process on an array of images, preprocesses - * each image, and concatenates the resulting features into a single Tensor. - * @param {RawImage[]} images The image(s) to extract features from. - * @param {RawImage[]} trimaps The trimaps(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. - */ - _call(images: RawImage[], trimaps: RawImage[]): Promise; + /** + * Calls the feature extraction process on an array of images, preprocesses + * each image, and concatenates the resulting features into a single Tensor. + * @param {RawImage[]} images The image(s) to extract features from. + * @param {RawImage[]} trimaps The trimaps(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. + */ + _call( + images: RawImage[], + trimaps: RawImage[], + ): Promise; } export class WhisperFeatureExtractor extends FeatureExtractor { - constructor(config: any); - window: Float64Array; - /** - * Computes the log-Mel spectrogram of the provided audio waveform. - * @param {Float32Array|Float64Array} waveform The audio waveform to process. 
- * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. - */ - _extract_fbank_features(waveform: Float32Array | Float64Array): { - data: Float32Array; - dims: number[]; - }; - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. - */ - _call(audio: Float32Array | Float64Array): Promise<{ - input_features: Tensor; - }>; + constructor(config: any); + window: Float64Array; + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. + */ + _extract_fbank_features(waveform: Float32Array | Float64Array): { + data: Float32Array; + dims: number[]; + }; + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + _call(audio: Float32Array | Float64Array): Promise<{ + input_features: Tensor; + }>; } export class Wav2Vec2FeatureExtractor extends FeatureExtractor { - /** - * @param {Float32Array} input_values - * @returns {Float32Array} - */ - _zero_mean_unit_var_norm(input_values: Float32Array): Float32Array; - /** - * Asynchronously extracts features from a given audio using the provided configuration. 
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors. - */ - _call(audio: Float32Array | Float64Array): Promise<{ - input_values: Tensor; - attention_mask: Tensor; - }>; + /** + * @param {Float32Array} input_values + * @returns {Float32Array} + */ + _zero_mean_unit_var_norm(input_values: Float32Array): Float32Array; + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_values: Tensor; attention_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention mask as Tensors. + */ + _call(audio: Float32Array | Float64Array): Promise<{ + input_values: Tensor; + attention_mask: Tensor; + }>; } export class ASTFeatureExtractor extends FeatureExtractor { - constructor(config: any); - mel_filters: number[][]; - window: Float64Array; - mean: any; - std: any; - /** - * Computes the log-Mel spectrogram of the provided audio waveform. - * @param {Float32Array|Float64Array} waveform The audio waveform to process. - * @param {number} max_length The maximum number of frames to return. - * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. - */ - _extract_fbank_features(waveform: Float32Array | Float64Array, max_length: number): { - data: Float32Array; - dims: number[]; - }; - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. 
- * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. - */ - _call(audio: Float32Array | Float64Array): Promise<{ - input_values: Tensor; - }>; + constructor(config: any); + mel_filters: number[][]; + window: Float64Array; + mean: any; + std: any; + /** + * Computes the log-Mel spectrogram of the provided audio waveform. + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @param {number} max_length The maximum number of frames to return. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. + */ + _extract_fbank_features( + waveform: Float32Array | Float64Array, + max_length: number, + ): { + data: Float32Array; + dims: number[]; + }; + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_values: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + _call(audio: Float32Array | Float64Array): Promise<{ + input_values: Tensor; + }>; } export class ClapFeatureExtractor extends FeatureExtractor { - constructor(config: any); - mel_filters: number[][]; - mel_filters_slaney: number[][]; - window: Float64Array; - /** - * Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. - * - * Four different path are possible: - * - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram - * will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram - * are then stacked together. They will later be used for `feature_fusion`. 
- * - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is - * padded based on `padding`. - * - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded - * based on `padding`, and is repeated `4` times. - * - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel - * spectrogram will be computed on a random crop of the waveform. - * - * @param {Float32Array|Float64Array} waveform The input waveform. - * @param {number} max_length The maximum length of the waveform. - * @param {string} truncation The truncation strategy to use. - * @param {string} padding The padding strategy to use. - * @returns {{ data: Float32Array; dims: number[]; longer: boolean; }} An object containing the mel spectrogram data as a Float32Array, its dimensions as an array of numbers, and a boolean indicating whether the waveform was longer than the max length. - */ - _get_input_mel(waveform: Float32Array | Float64Array, max_length: number, truncation: string, padding: string): { - data: Float32Array; - dims: number[]; - longer: boolean; - }; - /** - * Compute the log-mel spectrogram of the provided `waveform` using the Hann window. - * In CLAP, two different filter banks are used depending on the truncation pattern: - * - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from - * calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` - * is set to `"fusion"`. - * - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used - * `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original - * implementation when the truncation mode is not `"fusion"`. - * - * @param {Float32Array|Float64Array} waveform The audio waveform to process. 
- * @param {number[][]} mel_filters The mel filters to use. - * @param {number} [max_length=null] The maximum number of frames to return. - * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. - */ - _extract_fbank_features(waveform: Float32Array | Float64Array, mel_filters: number[][], max_length?: number): { - data: Float32Array; - dims: number[]; - }; - /** - * Asynchronously extracts features from a given audio using the provided configuration. - * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. - * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. - */ - _call(audio: Float32Array | Float64Array, { max_length, }?: { - max_length?: any; - }): Promise<{ - input_features: Tensor; - }>; -} -export class SpeechT5FeatureExtractor extends FeatureExtractor { + constructor(config: any); + mel_filters: number[][]; + mel_filters_slaney: number[][]; + window: Float64Array; + /** + * Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. + * + * Four different path are possible: + * - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram + * will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram + * are then stacked together. They will later be used for `feature_fusion`. + * - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is + * padded based on `padding`. + * - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded + * based on `padding`, and is repeated `4` times. 
+ * - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel + * spectrogram will be computed on a random crop of the waveform. + * + * @param {Float32Array|Float64Array} waveform The input waveform. + * @param {number} max_length The maximum length of the waveform. + * @param {string} truncation The truncation strategy to use. + * @param {string} padding The padding strategy to use. + * @returns {{ data: Float32Array; dims: number[]; longer: boolean; }} An object containing the mel spectrogram data as a Float32Array, its dimensions as an array of numbers, and a boolean indicating whether the waveform was longer than the max length. + */ + _get_input_mel( + waveform: Float32Array | Float64Array, + max_length: number, + truncation: string, + padding: string, + ): { + data: Float32Array; + dims: number[]; + longer: boolean; + }; + /** + * Compute the log-mel spectrogram of the provided `waveform` using the Hann window. + * In CLAP, two different filter banks are used depending on the truncation pattern: + * - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from + * calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` + * is set to `"fusion"`. + * - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used + * `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original + * implementation when the truncation mode is not `"fusion"`. + * + * @param {Float32Array|Float64Array} waveform The audio waveform to process. + * @param {number[][]} mel_filters The mel filters to use. + * @param {number} [max_length=null] The maximum number of frames to return. + * @returns {{data: Float32Array, dims: number[]}} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers. 
+ */ + _extract_fbank_features( + waveform: Float32Array | Float64Array, + mel_filters: number[][], + max_length?: number, + ): { + data: Float32Array; + dims: number[]; + }; + /** + * Asynchronously extracts features from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_features: Tensor }>} A Promise resolving to an object containing the extracted input features as a Tensor. + */ + _call( + audio: Float32Array | Float64Array, + { + max_length, + }?: { + max_length?: any; + }, + ): Promise<{ + input_features: Tensor; + }>; } +export class SpeechT5FeatureExtractor extends FeatureExtractor {} declare const Processor_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * Represents a Processor that extracts features from an input. * @extends Callable */ export class Processor extends Processor_base { - /** - * Creates a new Processor with the given feature extractor. - * @param {FeatureExtractor} feature_extractor The function used to extract features from the input. - */ - constructor(feature_extractor: FeatureExtractor); - feature_extractor: FeatureExtractor; - /** - * Calls the feature_extractor function with the given input. - * @param {any} input The input to extract features from. - * @param {...any} args Additional arguments. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - _call(input: any, ...args: any[]): Promise; + /** + * Creates a new Processor with the given feature extractor. + * @param {FeatureExtractor} feature_extractor The function used to extract features from the input. + */ + constructor(feature_extractor: FeatureExtractor); + feature_extractor: FeatureExtractor; + /** + * Calls the feature_extractor function with the given input. + * @param {any} input The input to extract features from. 
+ * @param {...any} args Additional arguments. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + _call(input: any, ...args: any[]): Promise; } export class SamProcessor extends Processor { - /** - * @borrows SamImageProcessor#_call as _call - */ - _call(...args: any[]): Promise; - /** - * @borrows SamImageProcessor#post_process_masks as post_process_masks - */ - post_process_masks(...args: any[]): any; - /** - * @borrows SamImageProcessor#reshape_input_points as reshape_input_points - */ - reshape_input_points(...args: any[]): any; + /** + * @borrows SamImageProcessor#_call as _call + */ + _call(...args: any[]): Promise; + /** + * @borrows SamImageProcessor#post_process_masks as post_process_masks + */ + post_process_masks(...args: any[]): any; + /** + * @borrows SamImageProcessor#reshape_input_points as reshape_input_points + */ + reshape_input_points(...args: any[]): any; } /** * Represents a WhisperProcessor that extracts features from an audio input. * @extends Processor */ export class WhisperProcessor extends Processor { - /** - * Calls the feature_extractor function with the given audio input. - * @param {any} audio The audio input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - _call(audio: any): Promise; + /** + * Calls the feature_extractor function with the given audio input. + * @param {any} audio The audio input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + _call(audio: any): Promise; } export class Wav2Vec2ProcessorWithLM extends Processor { - /** - * Calls the feature_extractor function with the given audio input. - * @param {any} audio The audio input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - _call(audio: any): Promise; + /** + * Calls the feature_extractor function with the given audio input. 
+ * @param {any} audio The audio input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + _call(audio: any): Promise; } export class SpeechT5Processor extends Processor { - /** - * Calls the feature_extractor function with the given input. - * @param {any} input The input to extract features from. - * @returns {Promise} A Promise that resolves with the extracted features. - */ - _call(input: any): Promise; -} -export class OwlViTProcessor extends Processor { + /** + * Calls the feature_extractor function with the given input. + * @param {any} input The input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + _call(input: any): Promise; } +export class OwlViTProcessor extends Processor {} /** * Helper class which is used to instantiate pretrained processors with the `from_pretrained` function. * The chosen processor class is determined by the type specified in the processor config. 
@@ -626,58 +726,67 @@ export class OwlViTProcessor extends Processor { * ``` */ export class AutoProcessor { - static FEATURE_EXTRACTOR_CLASS_MAPPING: { - WhisperFeatureExtractor: typeof WhisperFeatureExtractor; - ViTFeatureExtractor: typeof ViTFeatureExtractor; - MobileViTFeatureExtractor: typeof MobileViTFeatureExtractor; - OwlViTFeatureExtractor: typeof OwlViTFeatureExtractor; - CLIPFeatureExtractor: typeof CLIPFeatureExtractor; - ChineseCLIPFeatureExtractor: typeof ChineseCLIPFeatureExtractor; - SiglipImageProcessor: typeof SiglipImageProcessor; - ConvNextFeatureExtractor: typeof ConvNextFeatureExtractor; - ConvNextImageProcessor: typeof ConvNextImageProcessor; - SegformerFeatureExtractor: typeof SegformerFeatureExtractor; - BitImageProcessor: typeof BitImageProcessor; - DPTFeatureExtractor: typeof DPTFeatureExtractor; - GLPNFeatureExtractor: typeof GLPNFeatureExtractor; - BeitFeatureExtractor: typeof BeitFeatureExtractor; - DeiTFeatureExtractor: typeof DeiTFeatureExtractor; - DetrFeatureExtractor: typeof DetrFeatureExtractor; - YolosFeatureExtractor: typeof YolosFeatureExtractor; - DonutFeatureExtractor: typeof DonutFeatureExtractor; - NougatImageProcessor: typeof NougatImageProcessor; - ViTImageProcessor: typeof ViTImageProcessor; - VitMatteImageProcessor: typeof VitMatteImageProcessor; - SamImageProcessor: typeof SamImageProcessor; - Swin2SRImageProcessor: typeof Swin2SRImageProcessor; - Wav2Vec2FeatureExtractor: typeof Wav2Vec2FeatureExtractor; - SpeechT5FeatureExtractor: typeof SpeechT5FeatureExtractor; - ASTFeatureExtractor: typeof ASTFeatureExtractor; - ClapFeatureExtractor: typeof ClapFeatureExtractor; - }; - static PROCESSOR_CLASS_MAPPING: { - WhisperProcessor: typeof WhisperProcessor; - Wav2Vec2ProcessorWithLM: typeof Wav2Vec2ProcessorWithLM; - SamProcessor: typeof SamProcessor; - SpeechT5Processor: typeof SpeechT5Processor; - OwlViTProcessor: typeof OwlViTProcessor; - }; - /** - * Instantiate one of the processor classes of the library from a 
pretrained model. - * - * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the processor. - * - * @returns {Promise} A new instance of the Processor class. - */ - static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, }?: import('./utils/hub.js').PretrainedOptions): Promise; + static FEATURE_EXTRACTOR_CLASS_MAPPING: { + WhisperFeatureExtractor: typeof WhisperFeatureExtractor; + ViTFeatureExtractor: typeof ViTFeatureExtractor; + MobileViTFeatureExtractor: typeof MobileViTFeatureExtractor; + OwlViTFeatureExtractor: typeof OwlViTFeatureExtractor; + CLIPFeatureExtractor: typeof CLIPFeatureExtractor; + ChineseCLIPFeatureExtractor: typeof ChineseCLIPFeatureExtractor; + SiglipImageProcessor: typeof SiglipImageProcessor; + ConvNextFeatureExtractor: typeof ConvNextFeatureExtractor; + ConvNextImageProcessor: typeof ConvNextImageProcessor; + SegformerFeatureExtractor: typeof SegformerFeatureExtractor; + BitImageProcessor: typeof BitImageProcessor; + DPTFeatureExtractor: typeof DPTFeatureExtractor; + GLPNFeatureExtractor: typeof GLPNFeatureExtractor; + BeitFeatureExtractor: typeof BeitFeatureExtractor; + DeiTFeatureExtractor: typeof DeiTFeatureExtractor; + DetrFeatureExtractor: typeof 
DetrFeatureExtractor; + YolosFeatureExtractor: typeof YolosFeatureExtractor; + DonutFeatureExtractor: typeof DonutFeatureExtractor; + NougatImageProcessor: typeof NougatImageProcessor; + ViTImageProcessor: typeof ViTImageProcessor; + VitMatteImageProcessor: typeof VitMatteImageProcessor; + SamImageProcessor: typeof SamImageProcessor; + Swin2SRImageProcessor: typeof Swin2SRImageProcessor; + Wav2Vec2FeatureExtractor: typeof Wav2Vec2FeatureExtractor; + SpeechT5FeatureExtractor: typeof SpeechT5FeatureExtractor; + ASTFeatureExtractor: typeof ASTFeatureExtractor; + ClapFeatureExtractor: typeof ClapFeatureExtractor; + }; + static PROCESSOR_CLASS_MAPPING: { + WhisperProcessor: typeof WhisperProcessor; + Wav2Vec2ProcessorWithLM: typeof Wav2Vec2ProcessorWithLM; + SamProcessor: typeof SamProcessor; + SpeechT5Processor: typeof SpeechT5Processor; + OwlViTProcessor: typeof OwlViTProcessor; + }; + /** + * Instantiate one of the processor classes of the library from a pretrained model. + * + * The processor class to instantiate is selected based on the `feature_extractor_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`. + * @param {import('./utils/hub.js').PretrainedOptions} options Additional options for loading the processor. + * + * @returns {Promise} A new instance of the Processor class. 
+ */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + }?: import("./utils/hub.js").PretrainedOptions, + ): Promise; } /** * Named tuple to indicate the order we are using is (height x width), even though @@ -685,31 +794,32 @@ export class AutoProcessor { */ export type HeightWidth = [height: number, width: number]; export type ImageFeatureExtractorResult = { - /** - * The pixel values of the batched preprocessed images. - */ - pixel_values: Tensor; - /** - * Array of two-dimensional tuples like [[480, 640]]. - */ - original_sizes: HeightWidth[]; - /** - * Array of two-dimensional tuples like [[1000, 1330]]. - */ - reshaped_input_sizes: HeightWidth[]; + /** + * The pixel values of the batched preprocessed images. + */ + pixel_values: Tensor; + /** + * Array of two-dimensional tuples like [[480, 640]]. + */ + original_sizes: HeightWidth[]; + /** + * Array of two-dimensional tuples like [[1000, 1330]]. 
+ */ + reshaped_input_sizes: HeightWidth[]; }; export type DetrFeatureExtractorResultProps = { - pixel_mask: Tensor; + pixel_mask: Tensor; }; -export type DetrFeatureExtractorResult = ImageFeatureExtractorResult & DetrFeatureExtractorResultProps; +export type DetrFeatureExtractorResult = ImageFeatureExtractorResult & + DetrFeatureExtractorResultProps; export type SamImageProcessorResult = { - pixel_values: Tensor; - original_sizes: HeightWidth[]; - reshaped_input_sizes: HeightWidth[]; - input_points?: Tensor; - input_labels?: Tensor; + pixel_values: Tensor; + original_sizes: HeightWidth[]; + reshaped_input_sizes: HeightWidth[]; + input_points?: Tensor; + input_labels?: Tensor; }; -import { RawImage } from './utils/image.js'; -import { Tensor } from './utils/tensor.js'; +import { RawImage } from "./utils/image.js"; +import { Tensor } from "./utils/tensor.js"; export {}; -//# sourceMappingURL=processors.d.ts.map \ No newline at end of file +//# sourceMappingURL=processors.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/tokenizers.d.ts b/core/vendor/modules/@xenova/transformers/types/tokenizers.d.ts index b33c028b0..412bbdc5b 100644 --- a/core/vendor/modules/@xenova/transformers/types/tokenizers.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/tokenizers.d.ts @@ -1,6 +1,6 @@ declare const TokenizerModel_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * Abstract base class for tokenizer models. @@ -8,375 +8,402 @@ declare const TokenizerModel_base: new () => { * @extends Callable */ export class TokenizerModel extends TokenizerModel_base { - /** - * Instantiates a new TokenizerModel instance based on the configuration object provided. - * @param {Object} config The configuration object for the TokenizerModel. - * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. 
- * @returns {TokenizerModel} A new instance of a TokenizerModel. - * @throws Will throw an error if the TokenizerModel type in the config is not recognized. - */ - static fromConfig(config: any, ...args: any[]): TokenizerModel; - /** - * Creates a new instance of TokenizerModel. - * @param {Object} config The configuration object for the TokenizerModel. - */ - constructor(config: any); - config: any; - /** @type {string[]} */ - vocab: string[]; - /** - * A mapping of tokens to ids. - * @type {Map} - */ - tokens_to_ids: Map; - unk_token_id: any; - unk_token: any; - end_of_word_suffix: any; - /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ - fuse_unk: boolean; - /** - * Internal function to call the TokenizerModel instance. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} The encoded token IDs. - */ - _call(tokens: string[]): string[]; - /** - * Encodes a list of tokens into a list of token IDs. - * @param {string[]} tokens The tokens to encode. - * @returns {string[]} The encoded tokens. - * @throws Will throw an error if not implemented in a subclass. - */ - encode(tokens: string[]): string[]; - /** - * Converts a list of tokens into a list of token IDs. - * @param {string[]} tokens The tokens to convert. - * @returns {number[]} The converted token IDs. - */ - convert_tokens_to_ids(tokens: string[]): number[]; - /** - * Converts a list of token IDs into a list of tokens. - * @param {number[]} ids The token IDs to convert. - * @returns {string[]} The converted tokens. - */ - convert_ids_to_tokens(ids: number[]): string[]; + /** + * Instantiates a new TokenizerModel instance based on the configuration object provided. + * @param {Object} config The configuration object for the TokenizerModel. + * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. + * @returns {TokenizerModel} A new instance of a TokenizerModel. 
+ * @throws Will throw an error if the TokenizerModel type in the config is not recognized. + */ + static fromConfig(config: any, ...args: any[]): TokenizerModel; + /** + * Creates a new instance of TokenizerModel. + * @param {Object} config The configuration object for the TokenizerModel. + */ + constructor(config: any); + config: any; + /** @type {string[]} */ + vocab: string[]; + /** + * A mapping of tokens to ids. + * @type {Map} + */ + tokens_to_ids: Map; + unk_token_id: any; + unk_token: any; + end_of_word_suffix: any; + /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ + fuse_unk: boolean; + /** + * Internal function to call the TokenizerModel instance. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} The encoded token IDs. + */ + _call(tokens: string[]): string[]; + /** + * Encodes a list of tokens into a list of token IDs. + * @param {string[]} tokens The tokens to encode. + * @returns {string[]} The encoded tokens. + * @throws Will throw an error if not implemented in a subclass. + */ + encode(tokens: string[]): string[]; + /** + * Converts a list of tokens into a list of token IDs. + * @param {string[]} tokens The tokens to convert. + * @returns {number[]} The converted token IDs. + */ + convert_tokens_to_ids(tokens: string[]): number[]; + /** + * Converts a list of token IDs into a list of tokens. + * @param {number[]} ids The token IDs to convert. + * @returns {string[]} The converted tokens. + */ + convert_ids_to_tokens(ids: number[]): string[]; } declare const PreTrainedTokenizer_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; export class PreTrainedTokenizer extends PreTrainedTokenizer_base { + /** + * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`. + * + * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer. 
+ * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. + * + * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`. + * @returns {Promise} A new instance of the `PreTrainedTokenizer` class. + */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + legacy, + }?: PretrainedTokenizerOptions, + ): Promise; + /** + * Create a new PreTrainedTokenizer instance. + * @param {Object} tokenizerJSON The JSON of the tokenizer. + * @param {Object} tokenizerConfig The config of the tokenizer. + */ + constructor(tokenizerJSON: any, tokenizerConfig: any); + return_token_type_ids: boolean; + _default_chat_template: string; + _tokenizer_config: any; + normalizer: Normalizer; + pre_tokenizer: PreTokenizer; + model: TokenizerModel; + post_processor: PostProcessor; + decoder: Decoder; + special_tokens: any[]; + all_special_ids: number[]; + /** @type {AddedToken[]} */ + added_tokens: AddedToken[]; + additional_special_tokens: any; + added_tokens_regex: RegExp; + mask_token: string; + mask_token_id: number; + pad_token: string; + pad_token_id: number; + sep_token: string; + sep_token_id: number; + unk_token: string; + unk_token_id: number; + model_max_length: any; + /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */ + remove_space: boolean; + clean_up_tokenization_spaces: any; + do_lowercase_and_remove_accent: any; + /** @type {'right'|'left'} */ + padding_side: "right" | "left"; + legacy: boolean; + chat_template: any; + _compiled_template_cache: Map; + /** + * Returns the value of the first matching key in the tokenizer config object. + * @param {...string} keys One or more keys to search for in the tokenizer config object. 
+ * @returns {string|null} The value associated with the first matching key, or null if no match is found. + * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken". + */ + getToken(...keys: string[]): string | null; + /** + * @typedef {number[]|number[][]|Tensor} BatchEncodingItem + * + * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function. + * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model. + * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model. + * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model. + */ + /** + * Encode/tokenize the given text(s). + * @param {string|string[]} text The text to tokenize. + * @param {Object} options An optional object containing the following properties: + * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text. + * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences. + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. + * @param {boolean} [options.truncation=null] Whether to truncate the input sequences. + * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length. + * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays. + * @returns {BatchEncoding} Object to be passed to the model. 
+ */ + _call( + text: string | string[], + { + text_pair, + add_special_tokens, + padding, + truncation, + max_length, + return_tensor, + }?: { + text_pair?: string | string[]; + padding?: boolean | "max_length"; + add_special_tokens?: boolean; + truncation?: boolean; + max_length?: number; + return_tensor?: boolean; + }, + ): { /** - * Loads a pre-trained tokenizer from the given `pretrained_model_name_or_path`. - * - * @param {string} pretrained_model_name_or_path The path to the pre-trained tokenizer. - * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. - * - * @throws {Error} Throws an error if the tokenizer.json or tokenizer_config.json files are not found in the `pretrained_model_name_or_path`. - * @returns {Promise} A new instance of the `PreTrainedTokenizer` class. + * List of token ids to be fed to a model. */ - static from_pretrained(pretrained_model_name_or_path: string, { progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise; + input_ids: number[] | Tensor | number[][]; /** - * Create a new PreTrainedTokenizer instance. - * @param {Object} tokenizerJSON The JSON of the tokenizer. - * @param {Object} tokenizerConfig The config of the tokenizer. + * List of indices specifying which tokens should be attended to by the model. 
*/ - constructor(tokenizerJSON: any, tokenizerConfig: any); - return_token_type_ids: boolean; - _default_chat_template: string; - _tokenizer_config: any; - normalizer: Normalizer; - pre_tokenizer: PreTokenizer; - model: TokenizerModel; - post_processor: PostProcessor; - decoder: Decoder; - special_tokens: any[]; - all_special_ids: number[]; - /** @type {AddedToken[]} */ - added_tokens: AddedToken[]; - additional_special_tokens: any; - added_tokens_regex: RegExp; - mask_token: string; - mask_token_id: number; - pad_token: string; - pad_token_id: number; - sep_token: string; - sep_token_id: number; - unk_token: string; - unk_token_id: number; - model_max_length: any; - /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */ - remove_space: boolean; - clean_up_tokenization_spaces: any; - do_lowercase_and_remove_accent: any; - /** @type {'right'|'left'} */ - padding_side: 'right' | 'left'; - legacy: boolean; - chat_template: any; - _compiled_template_cache: Map; + attention_mask: number[] | Tensor | number[][]; /** - * Returns the value of the first matching key in the tokenizer config object. - * @param {...string} keys One or more keys to search for in the tokenizer config object. - * @returns {string|null} The value associated with the first matching key, or null if no match is found. - * @throws {Error} If an object is found for a matching key and its __type property is not "AddedToken". + * List of token type ids to be fed to a model. */ - getToken(...keys: string[]): string | null; - /** - * @typedef {number[]|number[][]|Tensor} BatchEncodingItem - * - * @typedef {Object} BatchEncoding Holds the output of the tokenizer's call function. - * @property {BatchEncodingItem} input_ids List of token ids to be fed to a model. - * @property {BatchEncodingItem} attention_mask List of indices specifying which tokens should be attended to by the model. 
- * @property {BatchEncodingItem} [token_type_ids] List of token type ids to be fed to a model. - */ - /** - * Encode/tokenize the given text(s). - * @param {string|string[]} text The text to tokenize. - * @param {Object} options An optional object containing the following properties: - * @param {string|string[]} [options.text_pair=null] Optional second sequence to be encoded. If set, must be the same type as text. - * @param {boolean|'max_length'} [options.padding=false] Whether to pad the input sequences. - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @param {boolean} [options.truncation=null] Whether to truncate the input sequences. - * @param {number} [options.max_length=null] Maximum length of the returned list and optionally padding length. - * @param {boolean} [options.return_tensor=true] Whether to return the results as Tensors or arrays. - * @returns {BatchEncoding} Object to be passed to the model. - */ - _call(text: string | string[], { text_pair, add_special_tokens, padding, truncation, max_length, return_tensor, }?: { - text_pair?: string | string[]; - padding?: boolean | 'max_length'; - add_special_tokens?: boolean; - truncation?: boolean; - max_length?: number; - return_tensor?: boolean; - }): { - /** - * List of token ids to be fed to a model. - */ - input_ids: number[] | Tensor | number[][]; - /** - * List of indices specifying which tokens should be attended to by the model. - */ - attention_mask: number[] | Tensor | number[][]; - /** - * List of token type ids to be fed to a model. - */ - token_type_ids?: number[] | Tensor | number[][]; - }; - /** - * Encodes a single text using the preprocessor pipeline of the tokenizer. - * - * @param {string|null} text The text to encode. - * @returns {string[]|null} The encoded tokens. 
- */ - _encode_text(text: string | null): string[] | null; - /** - * Encodes a single text or a pair of texts using the model's tokenizer. - * - * @param {string} text The text to encode. - * @param {string|null} text_pair The optional second text to encode. - * @param {Object} options An optional object containing the following properties: - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @returns {EncodingSingle} An object containing the encoded text. - * @private - */ - private _encode_plus; - /** - * Encodes a single text or a pair of texts using the model's tokenizer. - * - * @param {string} text The text to encode. - * @param {string|null} text_pair The optional second text to encode. - * @param {Object} options An optional object containing the following properties: - * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. - * @returns {number[]} An array of token IDs representing the encoded text(s). - */ - encode(text: string, text_pair?: string | null, { add_special_tokens, }?: { - add_special_tokens?: boolean; - }): number[]; - /** - * Decode a batch of tokenized sequences. - * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences. - * @param {Object} decode_args (Optional) Object with decoding arguments. - * @returns {string[]} List of decoded sequences. - */ - batch_decode(batch: number[][] | Tensor, decode_args?: any): string[]; - /** - * Decodes a sequence of token IDs back to a string. - * - * @param {number[]|Tensor} token_ids List/Tensor of token IDs to decode. - * @param {Object} [decode_args={}] - * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string. - * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed. 
- * - * @returns {string} The decoded string. - * @throws {Error} If `token_ids` is not a non-empty array of integers. - */ - decode(token_ids: number[] | Tensor, decode_args?: { - skip_special_tokens?: boolean; - clean_up_tokenization_spaces?: boolean; - }): string; - /** - * Decode a single list of token ids to a string. - * @param {number[]} token_ids List of token ids to decode - * @param {Object} decode_args Optional arguments for decoding - * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding - * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding. - * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`. - * @returns {string} The decoded string - */ - decode_single(token_ids: number[], { skip_special_tokens, clean_up_tokenization_spaces, }: { - skip_special_tokens?: boolean; - clean_up_tokenization_spaces?: boolean; - }): string; - get default_chat_template(): string; - _warned_about_chat_template: boolean; - /** - * @typedef {Object} Message - * @property {string} role The role of the message (e.g., "user" or "assistant" or "system"). - * @property {string} content The content of the message. - */ - /** - * Converts a list of message objects with `"role"` and `"content"` keys to a list of token - * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to - * determine the format and control tokens to use when converting. When chat_template is None, it will fall back - * to the default_chat_template specified at the class level. - * - * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information. - * - * **Example:** Applying a chat template to a conversation. 
- * - * ```javascript - * import { AutoTokenizer } from "@xenova/transformers"; - * - * const tokenizer = await AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1"); - * - * const chat = [ - * { "role": "user", "content": "Hello, how are you?" }, - * { "role": "assistant", "content": "I'm doing great. How can I help you today?" }, - * { "role": "user", "content": "I'd like to show off how chat templating works!" }, - * ] - * - * const text = tokenizer.apply_chat_template(chat, { tokenize: false }); - * // "[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" - * - * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false }); - * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793] - * ``` - * - * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys. - * @param {Object} options An optional object containing the following properties: - * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If - * this is not passed, the model's default chat template will be used instead. - * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate - * the start of an assistant message. This is useful when you want to generate a response from the model. - * Note that this argument will be passed to the chat template, and so it must be supported in the - * template for this argument to have any effect. - * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string. 
- * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false. - * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false. - * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false. - * If not specified, the tokenizer's `max_length` attribute will be used as a default. - * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false. - * @returns {string | Tensor | number[]| number[][]} The tokenized output. - */ - apply_chat_template(conversation: { - /** - * The role of the message (e.g., "user" or "assistant" or "system"). - */ - role: string; - /** - * The content of the message. - */ - content: string; - }[], { chat_template, add_generation_prompt, tokenize, padding, truncation, max_length, return_tensor, }?: { - chat_template?: string; - add_generation_prompt?: boolean; - tokenize?: boolean; - padding?: boolean; - truncation?: boolean; - max_length?: number; - return_tensor?: boolean; - }): string | Tensor | number[] | number[][]; + token_type_ids?: number[] | Tensor | number[][]; + }; + /** + * Encodes a single text using the preprocessor pipeline of the tokenizer. + * + * @param {string|null} text The text to encode. + * @returns {string[]|null} The encoded tokens. + */ + _encode_text(text: string | null): string[] | null; + /** + * Encodes a single text or a pair of texts using the model's tokenizer. + * + * @param {string} text The text to encode. + * @param {string|null} text_pair The optional second text to encode. + * @param {Object} options An optional object containing the following properties: + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. 
+ * @returns {EncodingSingle} An object containing the encoded text. + * @private + */ + private _encode_plus; + /** + * Encodes a single text or a pair of texts using the model's tokenizer. + * + * @param {string} text The text to encode. + * @param {string|null} text_pair The optional second text to encode. + * @param {Object} options An optional object containing the following properties: + * @param {boolean} [options.add_special_tokens=true] Whether or not to add the special tokens associated with the corresponding model. + * @returns {number[]} An array of token IDs representing the encoded text(s). + */ + encode( + text: string, + text_pair?: string | null, + { + add_special_tokens, + }?: { + add_special_tokens?: boolean; + }, + ): number[]; + /** + * Decode a batch of tokenized sequences. + * @param {number[][]|Tensor} batch List/Tensor of tokenized input sequences. + * @param {Object} decode_args (Optional) Object with decoding arguments. + * @returns {string[]} List of decoded sequences. + */ + batch_decode(batch: number[][] | Tensor, decode_args?: any): string[]; + /** + * Decodes a sequence of token IDs back to a string. + * + * @param {number[]|Tensor} token_ids List/Tensor of token IDs to decode. + * @param {Object} [decode_args={}] + * @param {boolean} [decode_args.skip_special_tokens=false] If true, special tokens are removed from the output string. + * @param {boolean} [decode_args.clean_up_tokenization_spaces=true] If true, spaces before punctuations and abbreviated forms are removed. + * + * @returns {string} The decoded string. + * @throws {Error} If `token_ids` is not a non-empty array of integers. + */ + decode( + token_ids: number[] | Tensor, + decode_args?: { + skip_special_tokens?: boolean; + clean_up_tokenization_spaces?: boolean; + }, + ): string; + /** + * Decode a single list of token ids to a string. 
+ * @param {number[]} token_ids List of token ids to decode + * @param {Object} decode_args Optional arguments for decoding + * @param {boolean} [decode_args.skip_special_tokens=false] Whether to skip special tokens during decoding + * @param {boolean} [decode_args.clean_up_tokenization_spaces=null] Whether to clean up tokenization spaces during decoding. + * If null, the value is set to `this.decoder.cleanup` if it exists, falling back to `this.clean_up_tokenization_spaces` if it exists, falling back to `true`. + * @returns {string} The decoded string + */ + decode_single( + token_ids: number[], + { + skip_special_tokens, + clean_up_tokenization_spaces, + }: { + skip_special_tokens?: boolean; + clean_up_tokenization_spaces?: boolean; + }, + ): string; + get default_chat_template(): string; + _warned_about_chat_template: boolean; + /** + * @typedef {Object} Message + * @property {string} role The role of the message (e.g., "user" or "assistant" or "system"). + * @property {string} content The content of the message. + */ + /** + * Converts a list of message objects with `"role"` and `"content"` keys to a list of token + * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to + * determine the format and control tokens to use when converting. When chat_template is None, it will fall back + * to the default_chat_template specified at the class level. + * + * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information. + * + * **Example:** Applying a chat template to a conversation. + * + * ```javascript + * import { AutoTokenizer } from "@xenova/transformers"; + * + * const tokenizer = await AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1"); + * + * const chat = [ + * { "role": "user", "content": "Hello, how are you?" }, + * { "role": "assistant", "content": "I'm doing great. How can I help you today?" 
}, + * { "role": "user", "content": "I'd like to show off how chat templating works!" }, + * ] + * + * const text = tokenizer.apply_chat_template(chat, { tokenize: false }); + * // "[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" + * + * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false }); + * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793] + * ``` + * + * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys. + * @param {Object} options An optional object containing the following properties: + * @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If + * this is not passed, the model's default chat template will be used instead. + * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate + * the start of an assistant message. This is useful when you want to generate a response from the model. + * Note that this argument will be passed to the chat template, and so it must be supported in the + * template for this argument to have any effect. + * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output will be a string. + * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. Has no effect if tokenize is false. + * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. Has no effect if tokenize is false. + * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. 
Has no effect if tokenize is false. + * If not specified, the tokenizer's `max_length` attribute will be used as a default. + * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false. + * @returns {string | Tensor | number[]| number[][]} The tokenized output. + */ + apply_chat_template( + conversation: { + /** + * The role of the message (e.g., "user" or "assistant" or "system"). + */ + role: string; + /** + * The content of the message. + */ + content: string; + }[], + { + chat_template, + add_generation_prompt, + tokenize, + padding, + truncation, + max_length, + return_tensor, + }?: { + chat_template?: string; + add_generation_prompt?: boolean; + tokenize?: boolean; + padding?: boolean; + truncation?: boolean; + max_length?: number; + return_tensor?: boolean; + }, + ): string | Tensor | number[] | number[][]; } /** * BertTokenizer is a class used to tokenize text for BERT models. * @extends PreTrainedTokenizer */ -export class BertTokenizer extends PreTrainedTokenizer { -} +export class BertTokenizer extends PreTrainedTokenizer {} /** * Albert tokenizer * @extends PreTrainedTokenizer */ -export class AlbertTokenizer extends PreTrainedTokenizer { -} -export class MobileBertTokenizer extends PreTrainedTokenizer { -} -export class SqueezeBertTokenizer extends PreTrainedTokenizer { -} -export class DebertaTokenizer extends PreTrainedTokenizer { -} -export class DebertaV2Tokenizer extends PreTrainedTokenizer { -} -export class HerbertTokenizer extends PreTrainedTokenizer { -} -export class ConvBertTokenizer extends PreTrainedTokenizer { -} -export class RoFormerTokenizer extends PreTrainedTokenizer { -} -export class DistilBertTokenizer extends PreTrainedTokenizer { -} -export class CamembertTokenizer extends PreTrainedTokenizer { -} +export class AlbertTokenizer extends PreTrainedTokenizer {} +export class MobileBertTokenizer extends PreTrainedTokenizer {} +export class 
SqueezeBertTokenizer extends PreTrainedTokenizer {} +export class DebertaTokenizer extends PreTrainedTokenizer {} +export class DebertaV2Tokenizer extends PreTrainedTokenizer {} +export class HerbertTokenizer extends PreTrainedTokenizer {} +export class ConvBertTokenizer extends PreTrainedTokenizer {} +export class RoFormerTokenizer extends PreTrainedTokenizer {} +export class DistilBertTokenizer extends PreTrainedTokenizer {} +export class CamembertTokenizer extends PreTrainedTokenizer {} export class XLMTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); -} -export class ElectraTokenizer extends PreTrainedTokenizer { -} -export class T5Tokenizer extends PreTrainedTokenizer { -} -export class GPT2Tokenizer extends PreTrainedTokenizer { -} -export class BartTokenizer extends PreTrainedTokenizer { + constructor(tokenizerJSON: any, tokenizerConfig: any); } +export class ElectraTokenizer extends PreTrainedTokenizer {} +export class T5Tokenizer extends PreTrainedTokenizer {} +export class GPT2Tokenizer extends PreTrainedTokenizer {} +export class BartTokenizer extends PreTrainedTokenizer {} export class MBartTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); - languageRegex: RegExp; - language_codes: any[]; - lang_to_token: (x: any) => any; - /** - * Helper function to build translation inputs for an `MBartTokenizer`. - * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. 
- */ - _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; -} -export class MBart50Tokenizer extends MBartTokenizer { -} -export class RobertaTokenizer extends PreTrainedTokenizer { + constructor(tokenizerJSON: any, tokenizerConfig: any); + languageRegex: RegExp; + language_codes: any[]; + lang_to_token: (x: any) => any; + /** + * Helper function to build translation inputs for an `MBartTokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. + * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. + */ + _build_translation_inputs( + raw_inputs: string | string[], + tokenizer_options: any, + generate_kwargs: any, + ): any; } +export class MBart50Tokenizer extends MBartTokenizer {} +export class RobertaTokenizer extends PreTrainedTokenizer {} export class BloomTokenizer extends GPT2Tokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); + constructor(tokenizerJSON: any, tokenizerConfig: any); } export class LlamaTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); - DEFAULT_SYSTEM_PROMPT: string; - use_default_system_prompt: any; - legacy: any; - get default_chat_template(): any; -} -export class CodeLlamaTokenizer extends LlamaTokenizer { -} -export class XLMRobertaTokenizer extends PreTrainedTokenizer { -} -export class MPNetTokenizer extends PreTrainedTokenizer { -} -export class FalconTokenizer extends PreTrainedTokenizer { -} -export class GPTNeoXTokenizer extends PreTrainedTokenizer { -} -export class EsmTokenizer extends PreTrainedTokenizer { + constructor(tokenizerJSON: any, tokenizerConfig: any); + DEFAULT_SYSTEM_PROMPT: string; + use_default_system_prompt: any; + legacy: any; + get default_chat_template(): any; } +export class CodeLlamaTokenizer extends LlamaTokenizer {} +export class 
XLMRobertaTokenizer extends PreTrainedTokenizer {} +export class MPNetTokenizer extends PreTrainedTokenizer {} +export class FalconTokenizer extends PreTrainedTokenizer {} +export class GPTNeoXTokenizer extends PreTrainedTokenizer {} +export class EsmTokenizer extends PreTrainedTokenizer {} /** * The NllbTokenizer class is used to tokenize text for NLLB ("No Language Left Behind") models. * @@ -391,18 +418,22 @@ export class EsmTokenizer extends PreTrainedTokenizer { * @see {@link https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200} */ export class NllbTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); - languageRegex: RegExp; - language_codes: any[]; - lang_to_token: (x: any) => any; - /** - * Helper function to build translation inputs for an `NllbTokenizer`. - * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. - */ - _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; + constructor(tokenizerJSON: any, tokenizerConfig: any); + languageRegex: RegExp; + language_codes: any[]; + lang_to_token: (x: any) => any; + /** + * Helper function to build translation inputs for an `NllbTokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. + * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. + */ + _build_translation_inputs( + raw_inputs: string | string[], + tokenizer_options: any, + generate_kwargs: any, + ): any; } /** * The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models. 
@@ -415,157 +446,170 @@ export class NllbTokenizer extends PreTrainedTokenizer { * @see {@link https://huggingface.co/facebook/m2m100_418M#languages-covered} */ export class M2M100Tokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); - languageRegex: RegExp; - language_codes: any[]; - lang_to_token: (x: any) => string; - /** - * Helper function to build translation inputs for an `M2M100Tokenizer`. - * @param {string|string[]} raw_inputs The text to tokenize. - * @param {Object} tokenizer_options Options to be sent to the tokenizer - * @param {Object} generate_kwargs Generation options. - * @returns {Object} Object to be passed to the model. - */ - _build_translation_inputs(raw_inputs: string | string[], tokenizer_options: any, generate_kwargs: any): any; + constructor(tokenizerJSON: any, tokenizerConfig: any); + languageRegex: RegExp; + language_codes: any[]; + lang_to_token: (x: any) => string; + /** + * Helper function to build translation inputs for an `M2M100Tokenizer`. + * @param {string|string[]} raw_inputs The text to tokenize. + * @param {Object} tokenizer_options Options to be sent to the tokenizer + * @param {Object} generate_kwargs Generation options. + * @returns {Object} Object to be passed to the model. + */ + _build_translation_inputs( + raw_inputs: string | string[], + tokenizer_options: any, + generate_kwargs: any, + ): any; } /** * WhisperTokenizer tokenizer * @extends PreTrainedTokenizer */ export class WhisperTokenizer extends PreTrainedTokenizer { - /** - * Decodes automatic speech recognition (ASR) sequences. - * @param {Array<{tokens: number[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode. - * @param {Object} options The options to use for decoding. - * @returns {Array, text: string}>}>} The decoded sequences. 
- */ - _decode_asr(sequences: Array<{ - tokens: number[]; - token_timestamps?: number[]; - stride: number[]; - }>, { return_timestamps, return_language, time_precision, force_full_sequences }?: any): (string | { - chunks?: undefined | Array<{ - language: string | null; - timestamp: Array; - text: string; - }>; - })[]; - /** - * Finds the longest common sequence among the provided sequences. - * @param {number[][]} sequences An array of sequences of token ids to compare. - * @returns {number[][]} The longest common sequence found. - * @throws {Error} If there is a bug within the function. - * @private - */ - private findLongestCommonSequence; - /** @private */ - private collateWordTimestamps; - /** - * Groups tokens by word. Returns a tuple containing a list of strings with the words, - * and a list of `token_id` sequences with the tokens making up each word. - * @param {number[]} tokens - * @param {string} [language] - * @param {string} prepend_punctionations - * @param {string} append_punctuations - * - * @private - */ - private combineTokensIntoWords; - /** - * @param {number[]} token_ids List of token IDs to decode. - * @param {Object} decode_args Optional arguments for decoding - * @private - */ - private decodeWithTimestamps; - /** - * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points. - * @param {number[]} tokens - * @returns {*} - * @private - */ - private splitTokensOnUnicode; - /** - * Combine tokens into words by splitting at whitespace and punctuation tokens. - * @param {number[]} tokens - * @private - */ - private splitTokensOnSpaces; - /** - * Merges punctuation tokens with neighboring words. 
- * @param {string[]} words - * @param {number[][]} tokens - * @param {number[][]} indices - * @param {string} prepended - * @param {string} appended - * @private - */ - private mergePunctuations; - /** - * Helper function to build translation inputs for a `WhisperTokenizer`, - * depending on the language, task, and whether to predict timestamp tokens. - * - * Used to override the prefix tokens appended to the start of the label sequence. - * - * **Example: Get ids for a language** - * ```javascript - * // instantiate the tokenizer and set the prefix token to Spanish - * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny'); - * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' }); - * // [(1, 50262), (2, 50363)] - * ``` - * - * @param {Object} options Options to generate the decoder prompt. - * @param {string} [options.language] The language of the transcription text. - * The corresponding language id token is appended to the start of the sequence for multilingual - * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended - * to the start of sequence. - * @param {string} [options.task] Task identifier to append at the start of sequence (if any). - * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and - * "translate" for speech translation. - * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence. - * @returns {number[][]} The decoder prompt ids. - */ - get_decoder_prompt_ids({ language, task, no_timestamps, }?: { - language?: string; - task?: string; - no_timestamps?: boolean; - }): number[][]; -} -export class CodeGenTokenizer extends PreTrainedTokenizer { -} -export class CLIPTokenizer extends PreTrainedTokenizer { -} -export class SiglipTokenizer extends PreTrainedTokenizer { + /** + * Decodes automatic speech recognition (ASR) sequences. 
+ * @param {Array<{tokens: number[], token_timestamps?: number[], stride: number[]}>} sequences The sequences to decode. + * @param {Object} options The options to use for decoding. + * @returns {Array, text: string}>}>} The decoded sequences. + */ + _decode_asr( + sequences: Array<{ + tokens: number[]; + token_timestamps?: number[]; + stride: number[]; + }>, + { + return_timestamps, + return_language, + time_precision, + force_full_sequences, + }?: any, + ): ( + | string + | { + chunks?: + | undefined + | Array<{ + language: string | null; + timestamp: Array; + text: string; + }>; + } + )[]; + /** + * Finds the longest common sequence among the provided sequences. + * @param {number[][]} sequences An array of sequences of token ids to compare. + * @returns {number[][]} The longest common sequence found. + * @throws {Error} If there is a bug within the function. + * @private + */ + private findLongestCommonSequence; + /** @private */ + private collateWordTimestamps; + /** + * Groups tokens by word. Returns a tuple containing a list of strings with the words, + * and a list of `token_id` sequences with the tokens making up each word. + * @param {number[]} tokens + * @param {string} [language] + * @param {string} prepend_punctionations + * @param {string} append_punctuations + * + * @private + */ + private combineTokensIntoWords; + /** + * @param {number[]} token_ids List of token IDs to decode. + * @param {Object} decode_args Optional arguments for decoding + * @private + */ + private decodeWithTimestamps; + /** + * Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points. + * @param {number[]} tokens + * @returns {*} + * @private + */ + private splitTokensOnUnicode; + /** + * Combine tokens into words by splitting at whitespace and punctuation tokens. + * @param {number[]} tokens + * @private + */ + private splitTokensOnSpaces; + /** + * Merges punctuation tokens with neighboring words. 
+ * @param {string[]} words + * @param {number[][]} tokens + * @param {number[][]} indices + * @param {string} prepended + * @param {string} appended + * @private + */ + private mergePunctuations; + /** + * Helper function to build translation inputs for a `WhisperTokenizer`, + * depending on the language, task, and whether to predict timestamp tokens. + * + * Used to override the prefix tokens appended to the start of the label sequence. + * + * **Example: Get ids for a language** + * ```javascript + * // instantiate the tokenizer and set the prefix token to Spanish + * const tokenizer = await WhisperTokenizer.from_pretrained('Xenova/whisper-tiny'); + * const forced_decoder_ids = tokenizer.get_decoder_prompt_ids({ language: 'spanish' }); + * // [(1, 50262), (2, 50363)] + * ``` + * + * @param {Object} options Options to generate the decoder prompt. + * @param {string} [options.language] The language of the transcription text. + * The corresponding language id token is appended to the start of the sequence for multilingual + * speech recognition and speech translation tasks, e.g. for "Spanish" the token "<|es|>" is appended + * to the start of sequence. + * @param {string} [options.task] Task identifier to append at the start of sequence (if any). + * This should be used for mulitlingual fine-tuning, with "transcribe" for speech recognition and + * "translate" for speech translation. + * @param {boolean} [options.no_timestamps] Whether to add the <|notimestamps|> token at the start of the sequence. + * @returns {number[][]} The decoder prompt ids. 
+ */ + get_decoder_prompt_ids({ + language, + task, + no_timestamps, + }?: { + language?: string; + task?: string; + no_timestamps?: boolean; + }): number[][]; } +export class CodeGenTokenizer extends PreTrainedTokenizer {} +export class CLIPTokenizer extends PreTrainedTokenizer {} +export class SiglipTokenizer extends PreTrainedTokenizer {} /** * @todo This model is not yet supported by Hugging Face's "fast" tokenizers library (https://github.com/huggingface/tokenizers). * Therefore, this implementation (which is based on fast tokenizers) may produce slightly inaccurate results. */ export class MarianTokenizer extends PreTrainedTokenizer { - languageRegex: RegExp; - supported_language_codes: string[]; - /** - * Encodes a single text. Overriding this method is necessary since the language codes - * must be removed before encoding with sentencepiece model. - * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213 - * - * @param {string|null} text The text to encode. - * @returns {Array} The encoded tokens. - */ - _encode_text(text: string | null): any[]; -} -export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { -} -export class BlenderbotTokenizer extends PreTrainedTokenizer { -} -export class BlenderbotSmallTokenizer extends BlenderbotTokenizer { -} -export class SpeechT5Tokenizer extends PreTrainedTokenizer { -} -export class NougatTokenizer extends PreTrainedTokenizer { + languageRegex: RegExp; + supported_language_codes: string[]; + /** + * Encodes a single text. Overriding this method is necessary since the language codes + * must be removed before encoding with sentencepiece model. + * @see https://github.com/huggingface/transformers/blob/12d51db243a00726a548a43cc333390ebae731e3/src/transformers/models/marian/tokenization_marian.py#L204-L213 + * + * @param {string|null} text The text to encode. + * @returns {Array} The encoded tokens. 
+ */ + _encode_text(text: string | null): any[]; } +export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {} +export class BlenderbotTokenizer extends PreTrainedTokenizer {} +export class BlenderbotSmallTokenizer extends BlenderbotTokenizer {} +export class SpeechT5Tokenizer extends PreTrainedTokenizer {} +export class NougatTokenizer extends PreTrainedTokenizer {} export class VitsTokenizer extends PreTrainedTokenizer { - constructor(tokenizerJSON: any, tokenizerConfig: any); + constructor(tokenizerJSON: any, tokenizerConfig: any); } /** * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function. @@ -575,163 +619,180 @@ export class VitsTokenizer extends PreTrainedTokenizer { * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); */ export class AutoTokenizer { - static TOKENIZER_CLASS_MAPPING: { - T5Tokenizer: typeof T5Tokenizer; - DistilBertTokenizer: typeof DistilBertTokenizer; - CamembertTokenizer: typeof CamembertTokenizer; - DebertaTokenizer: typeof DebertaTokenizer; - DebertaV2Tokenizer: typeof DebertaV2Tokenizer; - BertTokenizer: typeof BertTokenizer; - HerbertTokenizer: typeof HerbertTokenizer; - ConvBertTokenizer: typeof ConvBertTokenizer; - RoFormerTokenizer: typeof RoFormerTokenizer; - XLMTokenizer: typeof XLMTokenizer; - ElectraTokenizer: typeof ElectraTokenizer; - MobileBertTokenizer: typeof MobileBertTokenizer; - SqueezeBertTokenizer: typeof SqueezeBertTokenizer; - AlbertTokenizer: typeof AlbertTokenizer; - GPT2Tokenizer: typeof GPT2Tokenizer; - BartTokenizer: typeof BartTokenizer; - MBartTokenizer: typeof MBartTokenizer; - MBart50Tokenizer: typeof MBart50Tokenizer; - RobertaTokenizer: typeof RobertaTokenizer; - WhisperTokenizer: typeof WhisperTokenizer; - CodeGenTokenizer: typeof CodeGenTokenizer; - CLIPTokenizer: typeof CLIPTokenizer; - SiglipTokenizer: typeof SiglipTokenizer; - MarianTokenizer: typeof MarianTokenizer; - BloomTokenizer: typeof BloomTokenizer; - 
NllbTokenizer: typeof NllbTokenizer; - M2M100Tokenizer: typeof M2M100Tokenizer; - LlamaTokenizer: typeof LlamaTokenizer; - CodeLlamaTokenizer: typeof CodeLlamaTokenizer; - XLMRobertaTokenizer: typeof XLMRobertaTokenizer; - MPNetTokenizer: typeof MPNetTokenizer; - FalconTokenizer: typeof FalconTokenizer; - GPTNeoXTokenizer: typeof GPTNeoXTokenizer; - EsmTokenizer: typeof EsmTokenizer; - Wav2Vec2CTCTokenizer: typeof Wav2Vec2CTCTokenizer; - BlenderbotTokenizer: typeof BlenderbotTokenizer; - BlenderbotSmallTokenizer: typeof BlenderbotSmallTokenizer; - SpeechT5Tokenizer: typeof SpeechT5Tokenizer; - NougatTokenizer: typeof NougatTokenizer; - VitsTokenizer: typeof VitsTokenizer; - PreTrainedTokenizer: typeof PreTrainedTokenizer; - }; - /** - * Instantiate one of the tokenizer classes of the library from a pretrained model. - * - * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`. - * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. - * - * @returns {Promise} A new instance of the PreTrainedTokenizer class. 
- */ - static from_pretrained(pretrained_model_name_or_path: string, { quantized, progress_callback, config, cache_dir, local_files_only, revision, legacy, }?: PretrainedTokenizerOptions): Promise; + static TOKENIZER_CLASS_MAPPING: { + T5Tokenizer: typeof T5Tokenizer; + DistilBertTokenizer: typeof DistilBertTokenizer; + CamembertTokenizer: typeof CamembertTokenizer; + DebertaTokenizer: typeof DebertaTokenizer; + DebertaV2Tokenizer: typeof DebertaV2Tokenizer; + BertTokenizer: typeof BertTokenizer; + HerbertTokenizer: typeof HerbertTokenizer; + ConvBertTokenizer: typeof ConvBertTokenizer; + RoFormerTokenizer: typeof RoFormerTokenizer; + XLMTokenizer: typeof XLMTokenizer; + ElectraTokenizer: typeof ElectraTokenizer; + MobileBertTokenizer: typeof MobileBertTokenizer; + SqueezeBertTokenizer: typeof SqueezeBertTokenizer; + AlbertTokenizer: typeof AlbertTokenizer; + GPT2Tokenizer: typeof GPT2Tokenizer; + BartTokenizer: typeof BartTokenizer; + MBartTokenizer: typeof MBartTokenizer; + MBart50Tokenizer: typeof MBart50Tokenizer; + RobertaTokenizer: typeof RobertaTokenizer; + WhisperTokenizer: typeof WhisperTokenizer; + CodeGenTokenizer: typeof CodeGenTokenizer; + CLIPTokenizer: typeof CLIPTokenizer; + SiglipTokenizer: typeof SiglipTokenizer; + MarianTokenizer: typeof MarianTokenizer; + BloomTokenizer: typeof BloomTokenizer; + NllbTokenizer: typeof NllbTokenizer; + M2M100Tokenizer: typeof M2M100Tokenizer; + LlamaTokenizer: typeof LlamaTokenizer; + CodeLlamaTokenizer: typeof CodeLlamaTokenizer; + XLMRobertaTokenizer: typeof XLMRobertaTokenizer; + MPNetTokenizer: typeof MPNetTokenizer; + FalconTokenizer: typeof FalconTokenizer; + GPTNeoXTokenizer: typeof GPTNeoXTokenizer; + EsmTokenizer: typeof EsmTokenizer; + Wav2Vec2CTCTokenizer: typeof Wav2Vec2CTCTokenizer; + BlenderbotTokenizer: typeof BlenderbotTokenizer; + BlenderbotSmallTokenizer: typeof BlenderbotSmallTokenizer; + SpeechT5Tokenizer: typeof SpeechT5Tokenizer; + NougatTokenizer: typeof NougatTokenizer; + VitsTokenizer: 
typeof VitsTokenizer; + PreTrainedTokenizer: typeof PreTrainedTokenizer; + }; + /** + * Instantiate one of the tokenizer classes of the library from a pretrained model. + * + * The tokenizer class to instantiate is selected based on the `tokenizer_class` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained tokenizer hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/`. + * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. + * + * @returns {Promise} A new instance of the PreTrainedTokenizer class. + */ + static from_pretrained( + pretrained_model_name_or_path: string, + { + quantized, + progress_callback, + config, + cache_dir, + local_files_only, + revision, + legacy, + }?: PretrainedTokenizerOptions, + ): Promise; } /** * Additional tokenizer-specific properties. */ export type TokenizerProperties = { - /** - * Whether or not the `legacy` behavior of the tokenizer should be used. - */ - legacy?: boolean; + /** + * Whether or not the `legacy` behavior of the tokenizer should be used. + */ + legacy?: boolean; }; -export type PretrainedTokenizerOptions = import('./utils/hub.js').PretrainedOptions & TokenizerProperties; +export type PretrainedTokenizerOptions = + import("./utils/hub.js").PretrainedOptions & TokenizerProperties; export type BPENode = { - /** - * The token associated with the node - */ - token: string; - /** - * A positional bias for the node. - */ - bias: number; - /** - * The score of the node. 
- */ - score?: number; - /** - * The previous node in the linked list. - */ - prev?: BPENode; - /** - * The next node in the linked list. - */ - next?: BPENode; + /** + * The token associated with the node + */ + token: string; + /** + * A positional bias for the node. + */ + bias: number; + /** + * The score of the node. + */ + score?: number; + /** + * The previous node in the linked list. + */ + prev?: BPENode; + /** + * The next node in the linked list. + */ + next?: BPENode; }; -export type SplitDelimiterBehavior = 'removed' | 'isolated' | 'mergedWithPrevious' | 'mergedWithNext' | 'contiguous'; +export type SplitDelimiterBehavior = + | "removed" + | "isolated" + | "mergedWithPrevious" + | "mergedWithNext" + | "contiguous"; export type PostProcessedOutput = { - /** - * List of token produced by the post-processor. - */ - tokens: string[]; - /** - * List of token type ids produced by the post-processor. - */ - token_type_ids?: number[]; + /** + * List of token produced by the post-processor. + */ + tokens: string[]; + /** + * List of token type ids produced by the post-processor. + */ + token_type_ids?: number[]; }; export type EncodingSingle = { - /** - * List of token ids to be fed to a model. - */ - input_ids: number[]; - /** - * List of token type ids to be fed to a model - */ - attention_mask: number[]; - /** - * List of indices specifying which tokens should be attended to by the model - */ - token_type_ids?: number[]; + /** + * List of token ids to be fed to a model. + */ + input_ids: number[]; + /** + * List of token type ids to be fed to a model + */ + attention_mask: number[]; + /** + * List of indices specifying which tokens should be attended to by the model + */ + token_type_ids?: number[]; }; declare const Normalizer_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * A base class for text normalization. 
* @abstract */ declare class Normalizer extends Normalizer_base { - /** - * Factory method for creating normalizers from config objects. - * @static - * @param {Object} config The configuration object for the normalizer. - * @returns {Normalizer} A Normalizer object. - * @throws {Error} If an unknown Normalizer type is specified in the config. - */ - static fromConfig(config: any): Normalizer; - /** - * @param {Object} config The configuration object for the normalizer. - */ - constructor(config: any); - config: any; - /** - * Normalize the input text. - * @abstract - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - * @throws {Error} If this method is not implemented in a subclass. - */ - normalize(text: string): string; - /** - * Alias for {@link Normalizer#normalize}. - * @param {string} text The text to normalize. - * @returns {string} The normalized text. - */ - _call(text: string): string; + /** + * Factory method for creating normalizers from config objects. + * @static + * @param {Object} config The configuration object for the normalizer. + * @returns {Normalizer} A Normalizer object. + * @throws {Error} If an unknown Normalizer type is specified in the config. + */ + static fromConfig(config: any): Normalizer; + /** + * @param {Object} config The configuration object for the normalizer. + */ + constructor(config: any); + config: any; + /** + * Normalize the input text. + * @abstract + * @param {string} text The text to normalize. + * @returns {string} The normalized text. + * @throws {Error} If this method is not implemented in a subclass. + */ + normalize(text: string): string; + /** + * Alias for {@link Normalizer#normalize}. + * @param {string} text The text to normalize. + * @returns {string} The normalized text. 
+ */ + _call(text: string): string; } declare const PreTokenizer_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * A callable class representing a pre-tokenizer used in tokenization. Subclasses @@ -739,7 +800,7 @@ declare const PreTokenizer_base: new () => { * @extends Callable */ declare class PreTokenizer extends PreTokenizer_base { - /** + /** * Factory method that returns an instance of a subclass of `PreTokenizer` based on the provided configuration. * * @static @@ -747,35 +808,35 @@ declare class PreTokenizer extends PreTokenizer_base { * @returns {PreTokenizer} An instance of a subclass of `PreTokenizer`. * @throws {Error} If the provided configuration object does not correspond to any known pre-tokenizer. */ - static fromConfig(config: any): PreTokenizer; - /** - * Method that should be implemented by subclasses to define the specific pre-tokenization logic. - * - * @abstract - * @param {string} text The text to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} The pre-tokenized text. - * @throws {Error} If the method is not implemented in the subclass. - */ - pre_tokenize_text(text: string, options?: any): string[]; - /** - * Tokenizes the given text into pre-tokens. - * @param {string|string[]} text The text or array of texts to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of pre-tokens. - */ - pre_tokenize(text: string | string[], options?: any): string[]; - /** - * Alias for {@link PreTokenizer#pre_tokenize}. - * @param {string|string[]} text The text or array of texts to pre-tokenize. - * @param {Object} [options] Additional options for the pre-tokenization logic. - * @returns {string[]} An array of pre-tokens. 
- */ - _call(text: string | string[], options?: any): string[]; + static fromConfig(config: any): PreTokenizer; + /** + * Method that should be implemented by subclasses to define the specific pre-tokenization logic. + * + * @abstract + * @param {string} text The text to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} The pre-tokenized text. + * @throws {Error} If the method is not implemented in the subclass. + */ + pre_tokenize_text(text: string, options?: any): string[]; + /** + * Tokenizes the given text into pre-tokens. + * @param {string|string[]} text The text or array of texts to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of pre-tokens. + */ + pre_tokenize(text: string | string[], options?: any): string[]; + /** + * Alias for {@link PreTokenizer#pre_tokenize}. + * @param {string|string[]} text The text or array of texts to pre-tokenize. + * @param {Object} [options] Additional options for the pre-tokenization logic. + * @returns {string[]} An array of pre-tokens. + */ + _call(text: string | string[], options?: any): string[]; } declare const PostProcessor_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * @typedef {Object} PostProcessedOutput @@ -792,85 +853,85 @@ declare const PostProcessor_base: new () => { * @extends Callable */ declare class PostProcessor extends PostProcessor_base { - /** - * Factory method to create a PostProcessor object from a configuration object. - * - * @param {Object} config Configuration object representing a PostProcessor. - * @returns {PostProcessor} A PostProcessor object created from the given configuration. - * @throws {Error} If an unknown PostProcessor type is encountered. 
- */ - static fromConfig(config: any): PostProcessor; - /** - * @param {Object} config The configuration for the post-processor. - */ - constructor(config: any); - config: any; - /** - * Method to be implemented in subclass to apply post-processing on the given tokens. - * - * @param {Array} tokens The input tokens to be post-processed. - * @param {...*} args Additional arguments required by the post-processing logic. - * @returns {PostProcessedOutput} The post-processed tokens. - * @throws {Error} If the method is not implemented in subclass. - */ - post_process(tokens: any[], ...args: any[]): PostProcessedOutput; - /** - * Alias for {@link PostProcessor#post_process}. - * @param {Array} tokens The text or array of texts to post-process. - * @param {...*} args Additional arguments required by the post-processing logic. - * @returns {PostProcessedOutput} The post-processed tokens. - */ - _call(tokens: any[], ...args: any[]): PostProcessedOutput; + /** + * Factory method to create a PostProcessor object from a configuration object. + * + * @param {Object} config Configuration object representing a PostProcessor. + * @returns {PostProcessor} A PostProcessor object created from the given configuration. + * @throws {Error} If an unknown PostProcessor type is encountered. + */ + static fromConfig(config: any): PostProcessor; + /** + * @param {Object} config The configuration for the post-processor. + */ + constructor(config: any); + config: any; + /** + * Method to be implemented in subclass to apply post-processing on the given tokens. + * + * @param {Array} tokens The input tokens to be post-processed. + * @param {...*} args Additional arguments required by the post-processing logic. + * @returns {PostProcessedOutput} The post-processed tokens. + * @throws {Error} If the method is not implemented in subclass. + */ + post_process(tokens: any[], ...args: any[]): PostProcessedOutput; + /** + * Alias for {@link PostProcessor#post_process}. 
+ * @param {Array} tokens The text or array of texts to post-process. + * @param {...*} args Additional arguments required by the post-processing logic. + * @returns {PostProcessedOutput} The post-processed tokens. + */ + _call(tokens: any[], ...args: any[]): PostProcessedOutput; } declare const Decoder_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * The base class for token decoders. * @extends Callable */ declare class Decoder extends Decoder_base { - /** + /** * Creates a decoder instance based on the provided configuration. * * @param {Object} config The configuration object. * @returns {Decoder} A decoder instance. * @throws {Error} If an unknown decoder type is provided. */ - static fromConfig(config: any): Decoder; - /** - * Creates an instance of `Decoder`. - * - * @param {Object} config The configuration object. - */ - constructor(config: any); - config: any; - /** @type {AddedToken[]} */ - added_tokens: AddedToken[]; - end_of_word_suffix: any; - trim_offsets: any; - /** - * Calls the `decode` method. - * - * @param {string[]} tokens The list of tokens. - * @returns {string} The decoded string. - */ - _call(tokens: string[]): string; - /** - * Decodes a list of tokens. - * @param {string[]} tokens The list of tokens. - * @returns {string} The decoded string. - */ - decode(tokens: string[]): string; - /** - * Apply the decoder to a list of tokens. - * - * @param {string[]} tokens The list of tokens. - * @returns {string[]} The decoded list of tokens. - * @throws {Error} If the `decode_chain` method is not implemented in the subclass. - */ - decode_chain(tokens: string[]): string[]; + static fromConfig(config: any): Decoder; + /** + * Creates an instance of `Decoder`. + * + * @param {Object} config The configuration object. 
+ */ + constructor(config: any); + config: any; + /** @type {AddedToken[]} */ + added_tokens: AddedToken[]; + end_of_word_suffix: any; + trim_offsets: any; + /** + * Calls the `decode` method. + * + * @param {string[]} tokens The list of tokens. + * @returns {string} The decoded string. + */ + _call(tokens: string[]): string; + /** + * Decodes a list of tokens. + * @param {string[]} tokens The list of tokens. + * @returns {string} The decoded string. + */ + decode(tokens: string[]): string; + /** + * Apply the decoder to a list of tokens. + * + * @param {string[]} tokens The list of tokens. + * @returns {string[]} The decoded list of tokens. + * @throws {Error} If the `decode_chain` method is not implemented in the subclass. + */ + decode_chain(tokens: string[]): string[]; } /** * Represent a token added by the user on top of the existing Model vocabulary. @@ -879,34 +940,34 @@ declare class Decoder extends Decoder_base { * - Whether to include any whitespace on its left or right */ declare class AddedToken { - /** - * Creates a new instance of AddedToken. - * @param {Object} config Added token configuration object. - * @param {string} config.content The content of the added token. - * @param {number} config.id The id of the added token. - * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. - * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. - * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. - * @param {boolean} [config.normalized=false] Whether this token should be normalized. - * @param {boolean} [config.special=false] Whether this token is special. - */ - constructor(config: { - content: string; - id: number; - single_word?: boolean; - lstrip?: boolean; - rstrip?: boolean; - normalized?: boolean; - special?: boolean; - }); + /** + * Creates a new instance of AddedToken. 
+ * @param {Object} config Added token configuration object. + * @param {string} config.content The content of the added token. + * @param {number} config.id The id of the added token. + * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. + * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. + * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. + * @param {boolean} [config.normalized=false] Whether this token should be normalized. + * @param {boolean} [config.special=false] Whether this token is special. + */ + constructor(config: { content: string; id: number; - single_word: boolean; - lstrip: boolean; - rstrip: boolean; - special: boolean; - normalized: boolean; + single_word?: boolean; + lstrip?: boolean; + rstrip?: boolean; + normalized?: boolean; + special?: boolean; + }); + content: string; + id: number; + single_word: boolean; + lstrip: boolean; + rstrip: boolean; + special: boolean; + normalized: boolean; } -import { Tensor } from './utils/tensor.js'; +import { Tensor } from "./utils/tensor.js"; export {}; -//# sourceMappingURL=tokenizers.d.ts.map \ No newline at end of file +//# sourceMappingURL=tokenizers.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/transformers.d.ts b/core/vendor/modules/@xenova/transformers/types/transformers.d.ts index 3b83ad482..286778bdb 100644 --- a/core/vendor/modules/@xenova/transformers/types/transformers.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/transformers.d.ts @@ -8,4 +8,4 @@ export * from "./utils/audio.js"; export * from "./utils/image.js"; export * from "./utils/tensor.js"; export * from "./utils/maths.js"; -//# sourceMappingURL=transformers.d.ts.map \ No newline at end of file +//# sourceMappingURL=transformers.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/audio.d.ts 
b/core/vendor/modules/@xenova/transformers/types/utils/audio.d.ts index a670b3658..015a494cf 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/audio.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/audio.d.ts @@ -4,7 +4,10 @@ * @param {number} sampling_rate The sampling rate to use when decoding the audio. * @returns {Promise} The decoded audio as a `Float32Array`. */ -export function read_audio(url: string | URL, sampling_rate: number): Promise; +export function read_audio( + url: string | URL, + sampling_rate: number, +): Promise; /** * Generates a Hanning window of length M. * @@ -29,7 +32,16 @@ export function hanning(M: number): Float64Array; * @returns {number[][]} Triangular filter bank matrix, which is a 2D array of shape (`num_frequency_bins`, `num_mel_filters`). * This is a projection matrix to go from a spectrogram to a mel spectrogram. */ -export function mel_filter_bank(num_frequency_bins: number, num_mel_filters: number, min_frequency: number, max_frequency: number, sampling_rate: number, norm?: string, mel_scale?: string, triangularize_in_mel_space?: boolean): number[][]; +export function mel_filter_bank( + num_frequency_bins: number, + num_mel_filters: number, + min_frequency: number, + max_frequency: number, + sampling_rate: number, + norm?: string, + mel_scale?: string, + triangularize_in_mel_space?: boolean, +): number[][]; /** * Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. * @@ -81,7 +93,29 @@ export function mel_filter_bank(num_frequency_bins: number, num_mel_filters: num * @param {boolean} [options.transpose=false] If `true`, the returned spectrogram will have shape `(num_frames, num_frequency_bins/num_mel_filters)`. If `false`, the returned spectrogram will have shape `(num_frequency_bins/num_mel_filters, num_frames)`. 
* @returns {{data: Float32Array, dims: number[]}} Spectrogram of shape `(num_frequency_bins, length)` (regular spectrogram) or shape `(num_mel_filters, length)` (mel spectrogram). */ -export function spectrogram(waveform: Float32Array | Float64Array, window: Float32Array | Float64Array, frame_length: number, hop_length: number, { fft_length, power, center, pad_mode, onesided, preemphasis, mel_filters, mel_floor, log_mel, reference, min_value, db_range, remove_dc_offset, max_num_frames, do_pad, transpose, }?: { +export function spectrogram( + waveform: Float32Array | Float64Array, + window: Float32Array | Float64Array, + frame_length: number, + hop_length: number, + { + fft_length, + power, + center, + pad_mode, + onesided, + preemphasis, + mel_filters, + mel_floor, + log_mel, + reference, + min_value, + db_range, + remove_dc_offset, + max_num_frames, + do_pad, + transpose, + }?: { fft_length?: number; power?: number; center?: boolean; @@ -98,9 +132,10 @@ export function spectrogram(waveform: Float32Array | Float64Array, window: Float max_num_frames?: number; do_pad?: boolean; transpose?: boolean; -}): { - data: Float32Array; - dims: number[]; + }, +): { + data: Float32Array; + dims: number[]; }; /** * Returns an array containing the specified window. @@ -113,9 +148,17 @@ export function spectrogram(waveform: Float32Array | Float64Array, window: Float * @param {boolean} [options.center=true] Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. * @returns {Float64Array} The window of shape `(window_length,)` or `(frame_length,)`. 
*/ -export function window_function(window_length: number, name: string, { periodic, frame_length, center, }?: { +export function window_function( + window_length: number, + name: string, + { + periodic, + frame_length, + center, + }?: { periodic?: boolean; frame_length?: number; center?: boolean; -}): Float64Array; -//# sourceMappingURL=audio.d.ts.map \ No newline at end of file + }, +): Float64Array; +//# sourceMappingURL=audio.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/core.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/core.d.ts index 6d5e7e0c0..9839c75cb 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/core.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/core.d.ts @@ -93,7 +93,7 @@ export function calculateReflectOffset(i: number, w: number): number; * @type {new () => {(...args: any[]): any, _call(...args: any[]): any}} */ export const Callable: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; -//# sourceMappingURL=core.d.ts.map \ No newline at end of file +//# sourceMappingURL=core.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/data-structures.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/data-structures.d.ts index 1741f75a2..11f43ca33 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/data-structures.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/data-structures.d.ts @@ -16,209 +16,215 @@ * - https://github.com/belladoreai/llama-tokenizer-js (minor improvements) */ export class PriorityQueue { - /** - * Create a new PriorityQueue. - * @param {Function} comparator Comparator function to determine priority. Defaults to a MaxHeap. - */ - constructor(comparator?: Function); - _heap: any[]; - _comparator: Function; - /** - * The size of the queue - */ - get size(): number; - /** - * Check if the queue is empty. 
- * @returns {boolean} `true` if the queue is empty, `false` otherwise. - */ - isEmpty(): boolean; - /** - * Return the element with the highest priority in the queue. - * @returns {any} The highest priority element in the queue. - */ - peek(): any; - /** - * Add one or more elements to the queue. - * @param {...any} values The values to push into the queue. - * @returns {number} The new size of the queue. - */ - push(...values: any[]): number; - /** - * Add multiple elements to the queue. - * @param {any[]} values The values to push into the queue. - * @returns {number} The new size of the queue. - */ - extend(values: any[]): number; - /** - * Remove and return the element with the highest priority in the queue. - * @returns {any} The element with the highest priority in the queue. - */ - pop(): any; - /** - * Replace the element with the highest priority in the queue with a new value. - * @param {*} value The new value. - * @returns {*} The replaced value. - */ - replace(value: any): any; - /** - * Compute the index for the parent of the node at index `i`. - * @param {number} i The index of the node to get the parent of. - * @returns {number} The index of the parent node. - * @private - */ - private _parent; - /** - * Compute the index for the left child of the node at index `i`. - * @param {number} i The index of the node to get the left child of. - * @returns {number} The index of the left child. - * @private - */ - private _left; - /** - * Compute the index for the right child of the node at index `i`. - * @param {number} i The index of the node to get the right child of. - * @returns {number} The index of the right child. - * @private - */ - private _right; - /** - * Check if the element at index `i` is greater than the element at index `j`. - * @param {number} i The index of the first element to compare. - * @param {number} j The index of the second element to compare. 
- * @returns {boolean} `true` if the element at index `i` is greater than the element at index `j`, `false` otherwise. - * @private - */ - private _greater; - /** - * Swap the elements at indices `i` and `j`. - * @param {number} i The index of the first element to swap. - * @param {number} j The index of the second element to swap. - * @private - */ - private _swap; - /** - * Maintain the heap property by updating positions in the heap, - * starting at the last element and moving up the heap. - * @private - */ - private _siftUp; - /** - * Maintain the heap property by updating positions in the heap, - * starting at the first element and moving down the heap. - * @private - */ - private _siftDown; + /** + * Create a new PriorityQueue. + * @param {Function} comparator Comparator function to determine priority. Defaults to a MaxHeap. + */ + constructor(comparator?: Function); + _heap: any[]; + _comparator: Function; + /** + * The size of the queue + */ + get size(): number; + /** + * Check if the queue is empty. + * @returns {boolean} `true` if the queue is empty, `false` otherwise. + */ + isEmpty(): boolean; + /** + * Return the element with the highest priority in the queue. + * @returns {any} The highest priority element in the queue. + */ + peek(): any; + /** + * Add one or more elements to the queue. + * @param {...any} values The values to push into the queue. + * @returns {number} The new size of the queue. + */ + push(...values: any[]): number; + /** + * Add multiple elements to the queue. + * @param {any[]} values The values to push into the queue. + * @returns {number} The new size of the queue. + */ + extend(values: any[]): number; + /** + * Remove and return the element with the highest priority in the queue. + * @returns {any} The element with the highest priority in the queue. + */ + pop(): any; + /** + * Replace the element with the highest priority in the queue with a new value. + * @param {*} value The new value. + * @returns {*} The replaced value. 
+ */ + replace(value: any): any; + /** + * Compute the index for the parent of the node at index `i`. + * @param {number} i The index of the node to get the parent of. + * @returns {number} The index of the parent node. + * @private + */ + private _parent; + /** + * Compute the index for the left child of the node at index `i`. + * @param {number} i The index of the node to get the left child of. + * @returns {number} The index of the left child. + * @private + */ + private _left; + /** + * Compute the index for the right child of the node at index `i`. + * @param {number} i The index of the node to get the right child of. + * @returns {number} The index of the right child. + * @private + */ + private _right; + /** + * Check if the element at index `i` is greater than the element at index `j`. + * @param {number} i The index of the first element to compare. + * @param {number} j The index of the second element to compare. + * @returns {boolean} `true` if the element at index `i` is greater than the element at index `j`, `false` otherwise. + * @private + */ + private _greater; + /** + * Swap the elements at indices `i` and `j`. + * @param {number} i The index of the first element to swap. + * @param {number} j The index of the second element to swap. + * @private + */ + private _swap; + /** + * Maintain the heap property by updating positions in the heap, + * starting at the last element and moving up the heap. + * @private + */ + private _siftUp; + /** + * Maintain the heap property by updating positions in the heap, + * starting at the first element and moving down the heap. + * @private + */ + private _siftDown; } /** * A trie structure to efficiently store and search for strings. */ export class CharTrie { - root: CharTrieNode; - /** - * Adds one or more `texts` to the trie. - * @param {string[]} texts The strings to add to the trie. - */ - extend(texts: string[]): void; - /** - * Adds text to the trie. - * @param {string} text The string to add to the trie. 
- */ - push(text: string): void; - /** - * Searches the trie for all strings with a common prefix of `text`. - * @param {string} text The common prefix to search for. - * @yields {string} Each string in the trie that has `text` as a prefix. - */ - commonPrefixSearch(text: string): Generator; + root: CharTrieNode; + /** + * Adds one or more `texts` to the trie. + * @param {string[]} texts The strings to add to the trie. + */ + extend(texts: string[]): void; + /** + * Adds text to the trie. + * @param {string} text The string to add to the trie. + */ + push(text: string): void; + /** + * Searches the trie for all strings with a common prefix of `text`. + * @param {string} text The common prefix to search for. + * @yields {string} Each string in the trie that has `text` as a prefix. + */ + commonPrefixSearch(text: string): Generator; } /** * A lattice data structure to be used for tokenization. */ export class TokenLattice { - /** - * Creates a new TokenLattice instance. - * - * @param {string} sentence The input sentence to be tokenized. - * @param {number} bosTokenId The beginning-of-sequence token ID. - * @param {number} eosTokenId The end-of-sequence token ID. - */ - constructor(sentence: string, bosTokenId: number, eosTokenId: number); - sentence: string; - len: number; - bosTokenId: number; - eosTokenId: number; - nodes: TokenLatticeNode[]; - beginNodes: any[][]; - endNodes: any[][]; - /** - * Inserts a new token node into the token lattice. - * - * @param {number} pos The starting position of the token. - * @param {number} length The length of the token. - * @param {number} score The score of the token. - * @param {number} tokenId The token ID of the token. - */ - insert(pos: number, length: number, score: number, tokenId: number): void; - /** - * Implements the Viterbi algorithm to compute the most likely sequence of tokens. - * - * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens. 
- */ - viterbi(): TokenLatticeNode[]; - /** - * @param {TokenLatticeNode} node - * @returns {string} The array of nodes representing the most likely sequence of tokens. - */ - piece(node: TokenLatticeNode): string; - /** - * @returns {Array} The array of nodes representing the most likely sequence of tokens. - */ - tokens(): any[]; - /** - * @returns {Array} The array of nodes representing the most likely sequence of tokens. - */ - tokenIds(): any[]; + /** + * Creates a new TokenLattice instance. + * + * @param {string} sentence The input sentence to be tokenized. + * @param {number} bosTokenId The beginning-of-sequence token ID. + * @param {number} eosTokenId The end-of-sequence token ID. + */ + constructor(sentence: string, bosTokenId: number, eosTokenId: number); + sentence: string; + len: number; + bosTokenId: number; + eosTokenId: number; + nodes: TokenLatticeNode[]; + beginNodes: any[][]; + endNodes: any[][]; + /** + * Inserts a new token node into the token lattice. + * + * @param {number} pos The starting position of the token. + * @param {number} length The length of the token. + * @param {number} score The score of the token. + * @param {number} tokenId The token ID of the token. + */ + insert(pos: number, length: number, score: number, tokenId: number): void; + /** + * Implements the Viterbi algorithm to compute the most likely sequence of tokens. + * + * @returns {TokenLatticeNode[]} The array of nodes representing the most likely sequence of tokens. + */ + viterbi(): TokenLatticeNode[]; + /** + * @param {TokenLatticeNode} node + * @returns {string} The array of nodes representing the most likely sequence of tokens. + */ + piece(node: TokenLatticeNode): string; + /** + * @returns {Array} The array of nodes representing the most likely sequence of tokens. + */ + tokens(): any[]; + /** + * @returns {Array} The array of nodes representing the most likely sequence of tokens. + */ + tokenIds(): any[]; } /** * Represents a node in a character trie. 
*/ declare class CharTrieNode { - /** - * Returns a new `CharTrieNode` instance with default values. - * @returns {CharTrieNode} A new `CharTrieNode` instance with `isLeaf` set to `false` and an empty `children` map. - */ - static default(): CharTrieNode; - /** - * Create a new CharTrieNode. - * @param {boolean} isLeaf Whether the node is a leaf node or not. - * @param {Map} children A map containing the node's children, where the key is a character and the value is a `CharTrieNode`. - */ - constructor(isLeaf: boolean, children: Map); - isLeaf: boolean; - children: Map; + /** + * Returns a new `CharTrieNode` instance with default values. + * @returns {CharTrieNode} A new `CharTrieNode` instance with `isLeaf` set to `false` and an empty `children` map. + */ + static default(): CharTrieNode; + /** + * Create a new CharTrieNode. + * @param {boolean} isLeaf Whether the node is a leaf node or not. + * @param {Map} children A map containing the node's children, where the key is a character and the value is a `CharTrieNode`. + */ + constructor(isLeaf: boolean, children: Map); + isLeaf: boolean; + children: Map; } declare class TokenLatticeNode { - /** - * Represents a node in a token lattice for a given sentence. - * @param {number} tokenId The ID of the token associated with this node. - * @param {number} nodeId The ID of this node. - * @param {number} pos The starting position of the token in the sentence. - * @param {number} length The length of the token. - * @param {number} score The score associated with the token. - */ - constructor(tokenId: number, nodeId: number, pos: number, length: number, score: number); - tokenId: number; - nodeId: number; - pos: number; - length: number; - score: number; - prev: any; - backtraceScore: number; - /** - * Returns a clone of this node. - * @returns {TokenLatticeNode} A clone of this node. - */ - clone(): TokenLatticeNode; + /** + * Represents a node in a token lattice for a given sentence. 
+ * @param {number} tokenId The ID of the token associated with this node. + * @param {number} nodeId The ID of this node. + * @param {number} pos The starting position of the token in the sentence. + * @param {number} length The length of the token. + * @param {number} score The score associated with the token. + */ + constructor( + tokenId: number, + nodeId: number, + pos: number, + length: number, + score: number, + ); + tokenId: number; + nodeId: number; + pos: number; + length: number; + score: number; + prev: any; + backtraceScore: number; + /** + * Returns a clone of this node. + * @returns {TokenLatticeNode} A clone of this node. + */ + clone(): TokenLatticeNode; } export {}; -//# sourceMappingURL=data-structures.d.ts.map \ No newline at end of file +//# sourceMappingURL=data-structures.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/generation.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/generation.d.ts index 49ea01a51..e7334c40f 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/generation.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/generation.d.ts @@ -1,6 +1,6 @@ declare const LogitsProcessorList_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * A class representing a list of logits processors. A logits processor is a function that modifies the logits @@ -10,47 +10,47 @@ declare const LogitsProcessorList_base: new () => { * @extends Callable */ export class LogitsProcessorList extends LogitsProcessorList_base { - processors: any[]; - /** - * Adds a new logits processor to the list. - * - * @param {LogitsProcessor} item The logits processor function to add. - */ - push(item: LogitsProcessor): void; - /** - * Adds multiple logits processors to the list. - * - * @param {LogitsProcessor[]} items The logits processor functions to add. 
- */ - extend(items: LogitsProcessor[]): void; - /** - * Applies all logits processors in the list to a batch of logits, modifying them in-place. - * - * @param {number[]} input_ids The input IDs for the language model. - * @param {number[][]} batchedLogits A 2D array of logits, where each row corresponds to a single - * input sequence in the batch. - */ - _call(input_ids: number[], batchedLogits: number[][]): void; - [Symbol.iterator](): IterableIterator; + processors: any[]; + /** + * Adds a new logits processor to the list. + * + * @param {LogitsProcessor} item The logits processor function to add. + */ + push(item: LogitsProcessor): void; + /** + * Adds multiple logits processors to the list. + * + * @param {LogitsProcessor[]} items The logits processor functions to add. + */ + extend(items: LogitsProcessor[]): void; + /** + * Applies all logits processors in the list to a batch of logits, modifying them in-place. + * + * @param {number[]} input_ids The input IDs for the language model. + * @param {number[][]} batchedLogits A 2D array of logits, where each row corresponds to a single + * input sequence in the batch. + */ + _call(input_ids: number[], batchedLogits: number[][]): void; + [Symbol.iterator](): IterableIterator; } declare const LogitsProcessor_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * Base class for processing logits. * @extends Callable */ export class LogitsProcessor extends LogitsProcessor_base { - /** - * Apply the processor to the input logits. - * - * @abstract - * @param {Array} input_ids The input ids. - * @param {Tensor} logits The logits to process. - * @throws {Error} Throws an error if `_call` is not implemented in the subclass. - */ - _call(input_ids: any[], logits: Tensor): void; + /** + * Apply the processor to the input logits. + * + * @abstract + * @param {Array} input_ids The input ids. + * @param {Tensor} logits The logits to process. 
+ * @throws {Error} Throws an error if `_call` is not implemented in the subclass. + */ + _call(input_ids: any[], logits: Tensor): void; } /** * A logits processor that forces a specific token to be generated by the decoder. @@ -58,42 +58,42 @@ export class LogitsProcessor extends LogitsProcessor_base { * @extends LogitsProcessor */ export class ForceTokensLogitsProcessor extends LogitsProcessor { - /** - * Constructs a new instance of `ForceTokensLogitsProcessor`. - * - * @param {Array} forced_decoder_ids The ids of tokens that should be forced. - */ - constructor(forced_decoder_ids: any[]); - force_token_map: { - [k: string]: any; - }; - /** - * Apply the processor to the input logits. - * - * @param {Array} input_ids The input ids. - * @param {Tensor} logits The logits to process. - * @returns {Tensor} The processed logits. - */ - _call(input_ids: any[], logits: Tensor): Tensor; + /** + * Constructs a new instance of `ForceTokensLogitsProcessor`. + * + * @param {Array} forced_decoder_ids The ids of tokens that should be forced. + */ + constructor(forced_decoder_ids: any[]); + force_token_map: { + [k: string]: any; + }; + /** + * Apply the processor to the input logits. + * + * @param {Array} input_ids The input ids. + * @param {Tensor} logits The logits to process. + * @returns {Tensor} The processed logits. + */ + _call(input_ids: any[], logits: Tensor): Tensor; } /** * A LogitsProcessor that forces a BOS token at the beginning of the generated sequence. * @extends LogitsProcessor */ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor { - /** - * Create a ForcedBOSTokenLogitsProcessor. - * @param {number} bos_token_id The ID of the beginning-of-sequence token to be forced. - */ - constructor(bos_token_id: number); - bos_token_id: number; - /** - * Apply the BOS token forcing to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with BOS token forcing. 
- */ - _call(input_ids: any[], logits: any): any; + /** + * Create a ForcedBOSTokenLogitsProcessor. + * @param {number} bos_token_id The ID of the beginning-of-sequence token to be forced. + */ + constructor(bos_token_id: number); + bos_token_id: number; + /** + * Apply the BOS token forcing to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with BOS token forcing. + */ + _call(input_ids: any[], logits: any): any; } /** * A logits processor that forces end-of-sequence token probability to 1. @@ -101,21 +101,21 @@ export class ForcedBOSTokenLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class ForcedEOSTokenLogitsProcessor extends LogitsProcessor { - /** - * Create a ForcedEOSTokenLogitsProcessor. - * @param {number} max_length Max length of the sequence. - * @param {number|number[]} forced_eos_token_id The ID of the end-of-sequence token to be forced. - */ - constructor(max_length: number, forced_eos_token_id: number | number[]); - max_length: number; - forced_eos_token_id: number | number[]; - /** - * Apply the processor to input_ids and logits. - * - * @param {number[]} input_ids The input ids. - * @param {Tensor} logits The logits tensor. - */ - _call(input_ids: number[], logits: Tensor): void; + /** + * Create a ForcedEOSTokenLogitsProcessor. + * @param {number} max_length Max length of the sequence. + * @param {number|number[]} forced_eos_token_id The ID of the end-of-sequence token to be forced. + */ + constructor(max_length: number, forced_eos_token_id: number | number[]); + max_length: number; + forced_eos_token_id: number | number[]; + /** + * Apply the processor to input_ids and logits. + * + * @param {number[]} input_ids The input ids. + * @param {Tensor} logits The logits tensor. 
+ */ + _call(input_ids: number[], logits: Tensor): void; } /** * A LogitsProcessor that suppresses a list of tokens as soon as the `generate` function starts @@ -124,53 +124,53 @@ export class ForcedEOSTokenLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor { - /** - * Create a SuppressTokensAtBeginLogitsProcessor. - * @param {number[]} begin_suppress_tokens The IDs of the tokens to suppress. - * @param {number} begin_index The number of tokens to generate before suppressing tokens. - */ - constructor(begin_suppress_tokens: number[], begin_index: number); - begin_suppress_tokens: number[]; - begin_index: number; - /** - * Apply the BOS token forcing to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with BOS token forcing. - */ - _call(input_ids: any[], logits: any): any; + /** + * Create a SuppressTokensAtBeginLogitsProcessor. + * @param {number[]} begin_suppress_tokens The IDs of the tokens to suppress. + * @param {number} begin_index The number of tokens to generate before suppressing tokens. + */ + constructor(begin_suppress_tokens: number[], begin_index: number); + begin_suppress_tokens: number[]; + begin_index: number; + /** + * Apply the BOS token forcing to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with BOS token forcing. + */ + _call(input_ids: any[], logits: any): any; } /** * A LogitsProcessor that handles adding timestamps to generated text. * @extends LogitsProcessor */ export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { - /** - * Constructs a new WhisperTimeStampLogitsProcessor. - * @param {Object} generate_config The config object passed to the `generate()` method of a transformer model. - * @param {number} generate_config.eos_token_id The ID of the end-of-sequence token. 
- * @param {number} generate_config.no_timestamps_token_id The ID of the token used to indicate that a token should not have a timestamp. - * @param {number[][]} [generate_config.forced_decoder_ids] An array of two-element arrays representing decoder IDs that are forced to appear in the output. The second element of each array indicates whether the token is a timestamp. - * @param {number} [generate_config.max_initial_timestamp_index] The maximum index at which an initial timestamp can appear. - */ - constructor(generate_config: { - eos_token_id: number; - no_timestamps_token_id: number; - forced_decoder_ids?: number[][]; - max_initial_timestamp_index?: number; - }); + /** + * Constructs a new WhisperTimeStampLogitsProcessor. + * @param {Object} generate_config The config object passed to the `generate()` method of a transformer model. + * @param {number} generate_config.eos_token_id The ID of the end-of-sequence token. + * @param {number} generate_config.no_timestamps_token_id The ID of the token used to indicate that a token should not have a timestamp. + * @param {number[][]} [generate_config.forced_decoder_ids] An array of two-element arrays representing decoder IDs that are forced to appear in the output. The second element of each array indicates whether the token is a timestamp. + * @param {number} [generate_config.max_initial_timestamp_index] The maximum index at which an initial timestamp can appear. + */ + constructor(generate_config: { eos_token_id: number; no_timestamps_token_id: number; - timestamp_begin: number; - begin_index: number; - max_initial_timestamp_index: number; - /** - * Modify the logits to handle timestamp tokens. - * @param {Array} input_ids The input sequence of tokens. - * @param {Tensor} logits The logits output by the model. - * @returns {Tensor} The modified logits. 
- */ - _call(input_ids: any[], logits: Tensor): Tensor; + forced_decoder_ids?: number[][]; + max_initial_timestamp_index?: number; + }); + eos_token_id: number; + no_timestamps_token_id: number; + timestamp_begin: number; + begin_index: number; + max_initial_timestamp_index: number; + /** + * Modify the logits to handle timestamp tokens. + * @param {Array} input_ids The input sequence of tokens. + * @param {Tensor} logits The logits output by the model. + * @returns {Tensor} The modified logits. + */ + _call(input_ids: any[], logits: Tensor): Tensor; } /** * A logits processor that disallows ngrams of a certain size to be repeated. @@ -178,38 +178,41 @@ export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor { - /** - * Create a NoRepeatNGramLogitsProcessor. - * @param {number} no_repeat_ngram_size The no-repeat-ngram size. All ngrams of this size can only occur once. - */ - constructor(no_repeat_ngram_size: number); - no_repeat_ngram_size: number; - /** - * Generate n-grams from a sequence of token ids. - * @param {number[]} prevInputIds List of previous input ids - * @returns {Map} Map of generated n-grams - */ - getNgrams(prevInputIds: number[]): Map; - /** - * Generate n-grams from a sequence of token ids. - * @param {Map} bannedNgrams Map of banned n-grams - * @param {number[]} prevInputIds List of previous input ids - * @returns {number[]} Map of generated n-grams - */ - getGeneratedNgrams(bannedNgrams: Map, prevInputIds: number[]): number[]; - /** - * Calculate banned n-gram tokens - * @param {number[]} prevInputIds List of previous input ids - * @returns {number[]} Map of generated n-grams - */ - calcBannedNgramTokens(prevInputIds: number[]): number[]; - /** - * Apply the no-repeat-ngram processor to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. 
- * @returns {Object} The logits with no-repeat-ngram processing. - */ - _call(input_ids: any[], logits: any): any; + /** + * Create a NoRepeatNGramLogitsProcessor. + * @param {number} no_repeat_ngram_size The no-repeat-ngram size. All ngrams of this size can only occur once. + */ + constructor(no_repeat_ngram_size: number); + no_repeat_ngram_size: number; + /** + * Generate n-grams from a sequence of token ids. + * @param {number[]} prevInputIds List of previous input ids + * @returns {Map} Map of generated n-grams + */ + getNgrams(prevInputIds: number[]): Map; + /** + * Generate n-grams from a sequence of token ids. + * @param {Map} bannedNgrams Map of banned n-grams + * @param {number[]} prevInputIds List of previous input ids + * @returns {number[]} Map of generated n-grams + */ + getGeneratedNgrams( + bannedNgrams: Map, + prevInputIds: number[], + ): number[]; + /** + * Calculate banned n-gram tokens + * @param {number[]} prevInputIds List of previous input ids + * @returns {number[]} Map of generated n-grams + */ + calcBannedNgramTokens(prevInputIds: number[]): number[]; + /** + * Apply the no-repeat-ngram processor to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with no-repeat-ngram processing. + */ + _call(input_ids: any[], logits: any): any; } /** * A logits processor that penalises repeated output tokens. @@ -217,19 +220,19 @@ export class NoRepeatNGramLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor { - /** - * Create a RepetitionPenaltyLogitsProcessor. - * @param {number} penalty The penalty to apply for repeated tokens. - */ - constructor(penalty: number); - penalty: number; - /** - * Apply the repetition penalty to the logits. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The logits with repetition penalty processing. 
- */ - _call(input_ids: any[], logits: any): any; + /** + * Create a RepetitionPenaltyLogitsProcessor. + * @param {number} penalty The penalty to apply for repeated tokens. + */ + constructor(penalty: number); + penalty: number; + /** + * Apply the repetition penalty to the logits. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The logits with repetition penalty processing. + */ + _call(input_ids: any[], logits: any): any; } /** * A logits processor that enforces a minimum number of tokens. @@ -237,21 +240,21 @@ export class RepetitionPenaltyLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class MinLengthLogitsProcessor extends LogitsProcessor { - /** - * Create a MinLengthLogitsProcessor. - * @param {number} min_length The minimum length below which the score of `eos_token_id` is set to negative infinity. - * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. - */ - constructor(min_length: number, eos_token_id: number | number[]); - min_length: number; - eos_token_id: number[]; - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The processed logits. - */ - _call(input_ids: any[], logits: any): any; + /** + * Create a MinLengthLogitsProcessor. + * @param {number} min_length The minimum length below which the score of `eos_token_id` is set to negative infinity. + * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. + */ + constructor(min_length: number, eos_token_id: number | number[]); + min_length: number; + eos_token_id: number[]; + /** + * Apply logit processor. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. + */ + _call(input_ids: any[], logits: any): any; } /** * A logits processor that enforces a minimum number of new tokens. 
@@ -259,40 +262,44 @@ export class MinLengthLogitsProcessor extends LogitsProcessor { * @extends LogitsProcessor */ export class MinNewTokensLengthLogitsProcessor extends LogitsProcessor { - /** - * Create a MinNewTokensLengthLogitsProcessor. - * @param {number} prompt_length_to_skip The input tokens length. - * @param {number} min_new_tokens The minimum *new* tokens length below which the score of `eos_token_id` is set to negative infinity. - * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. - */ - constructor(prompt_length_to_skip: number, min_new_tokens: number, eos_token_id: number | number[]); - prompt_length_to_skip: number; - min_new_tokens: number; - eos_token_id: number[]; - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The processed logits. - */ - _call(input_ids: any[], logits: any): any; + /** + * Create a MinNewTokensLengthLogitsProcessor. + * @param {number} prompt_length_to_skip The input tokens length. + * @param {number} min_new_tokens The minimum *new* tokens length below which the score of `eos_token_id` is set to negative infinity. + * @param {number|number[]} eos_token_id The ID/IDs of the end-of-sequence token. + */ + constructor( + prompt_length_to_skip: number, + min_new_tokens: number, + eos_token_id: number | number[], + ); + prompt_length_to_skip: number; + min_new_tokens: number; + eos_token_id: number[]; + /** + * Apply logit processor. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. + */ + _call(input_ids: any[], logits: any): any; } export class NoBadWordsLogitsProcessor extends LogitsProcessor { - /** - * Create a `NoBadWordsLogitsProcessor`. - * @param {number[][]} bad_words_ids List of list of token ids that are not allowed to be generated. - * @param {number|number[]} eos_token_id The id of the *end-of-sequence* token. 
Optionally, use a list to set multiple *end-of-sequence* tokens. - */ - constructor(bad_words_ids: number[][], eos_token_id: number | number[]); - bad_words_ids: number[][]; - eos_token_id: number[]; - /** - * Apply logit processor. - * @param {Array} input_ids The input IDs. - * @param {Object} logits The logits. - * @returns {Object} The processed logits. - */ - _call(input_ids: any[], logits: any): any; + /** + * Create a `NoBadWordsLogitsProcessor`. + * @param {number[][]} bad_words_ids List of list of token ids that are not allowed to be generated. + * @param {number|number[]} eos_token_id The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + */ + constructor(bad_words_ids: number[][], eos_token_id: number | number[]); + bad_words_ids: number[][]; + eos_token_id: number[]; + /** + * Apply logit processor. + * @param {Array} input_ids The input IDs. + * @param {Object} logits The logits. + * @returns {Object} The processed logits. + */ + _call(input_ids: any[], logits: any): any; } /** * @typedef {Object} GenerationConfigType The default configuration parameters. @@ -355,239 +362,241 @@ export class NoBadWordsLogitsProcessor extends LogitsProcessor { * Class that holds a configuration for a generation task. * @type {new (kwargs?: GenerationConfigType) => GenerationConfigType} */ -export const GenerationConfig: new (kwargs?: GenerationConfigType) => GenerationConfigType; +export const GenerationConfig: new ( + kwargs?: GenerationConfigType, +) => GenerationConfigType; declare const Sampler_base: new () => { - (...args: any[]): any; - _call(...args: any[]): any; + (...args: any[]): any; + _call(...args: any[]): any; }; /** * Sampler is a base class for all sampling methods used for text generation. */ export class Sampler extends Sampler_base { - /** - * Returns a Sampler object based on the specified options. - * @param {GenerationConfigType} generation_config An object containing options for the sampler. 
- * @returns {Sampler} A Sampler object. - */ - static getSampler(generation_config: GenerationConfigType): Sampler; - /** - * Creates a new Sampler object with the specified generation config. - * @param {GenerationConfigType} generation_config The generation config. - */ - constructor(generation_config: GenerationConfigType); - generation_config: GenerationConfigType; - /** - * Executes the sampler, using the specified logits. - * @param {Tensor} logits - * @param {number} index - * @returns {void} - */ - _call(logits: Tensor, index?: number): void; - /** - * Abstract method for sampling the logits. - * @param {Tensor} logits - * @param {number} index - * @throws {Error} - */ - sample(logits: Tensor, index: number): void; - /** - * Returns the specified logits as an array, with temperature applied. - * @param {Tensor} logits - * @param {number} index - * @returns {Float32Array} - */ - getLogits(logits: Tensor, index: number): Float32Array; - /** - * Selects an item randomly based on the specified probabilities. - * @param {Array} probabilities An array of probabilities to use for selection. - * @returns {number} The index of the selected item. - */ - randomSelect(probabilities: any[]): number; + /** + * Returns a Sampler object based on the specified options. + * @param {GenerationConfigType} generation_config An object containing options for the sampler. + * @returns {Sampler} A Sampler object. + */ + static getSampler(generation_config: GenerationConfigType): Sampler; + /** + * Creates a new Sampler object with the specified generation config. + * @param {GenerationConfigType} generation_config The generation config. + */ + constructor(generation_config: GenerationConfigType); + generation_config: GenerationConfigType; + /** + * Executes the sampler, using the specified logits. + * @param {Tensor} logits + * @param {number} index + * @returns {void} + */ + _call(logits: Tensor, index?: number): void; + /** + * Abstract method for sampling the logits. 
+ * @param {Tensor} logits + * @param {number} index + * @throws {Error} + */ + sample(logits: Tensor, index: number): void; + /** + * Returns the specified logits as an array, with temperature applied. + * @param {Tensor} logits + * @param {number} index + * @returns {Float32Array} + */ + getLogits(logits: Tensor, index: number): Float32Array; + /** + * Selects an item randomly based on the specified probabilities. + * @param {Array} probabilities An array of probabilities to use for selection. + * @returns {number} The index of the selected item. + */ + randomSelect(probabilities: any[]): number; } /** * The default configuration parameters. */ export type GenerationConfigType = { - /** - * The maximum length the generated tokens can have. Corresponds to the length of the input prompt + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - */ - max_length?: number; - /** - * The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. - */ - max_new_tokens?: number; - /** - * The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set. - */ - min_length?: number; - /** - * The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt. - */ - min_new_tokens?: number; - /** - * Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: - * - `true`, where the generation stops as soon as there are `num_beams` complete candidates; - * - `false`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; - * - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - */ - early_stopping?: boolean | "never"; - /** - * The maximum amount of time you allow the computation to run for in seconds. 
Generation will still finish the current pass after allocated time has been passed. - */ - max_time?: number; - /** - * Whether or not to use sampling; use greedy decoding otherwise. - */ - do_sample?: boolean; - /** - * Number of beams for beam search. 1 means no beam search. - */ - num_beams?: number; - /** - * Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. - */ - num_beam_groups?: number; - /** - * The values balance the model confidence and the degeneration penalty in contrastive search decoding. - */ - penalty_alpha?: number; - /** - * Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. - */ - use_cache?: boolean; - /** - * The value used to modulate the next token probabilities. - */ - temperature?: number; - /** - * The number of highest probability vocabulary tokens to keep for top-k-filtering. - */ - top_k?: number; - /** - * If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. - */ - top_p?: number; - /** - * Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation. See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details. - */ - typical_p?: number; - /** - * If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. 
See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. - */ - epsilon_cutoff?: number; - /** - * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. - */ - eta_cutoff?: number; - /** - * This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. - */ - diversity_penalty?: number; - /** - * The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. - */ - repetition_penalty?: number; - /** - * The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty. - */ - encoder_repetition_penalty?: number; - /** - * Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences. - */ - length_penalty?: number; - /** - * If set to int > 0, all ngrams of that size can only occur once. - */ - no_repeat_ngram_size?: number; - /** - * List of token ids that are not allowed to be generated. 
In order to get the token ids of the words that should not appear in the generated text, use `(await tokenizer(bad_words, {add_prefix_space: true, add_special_tokens: false})).input_ids`. - */ - bad_words_ids?: number[][]; - /** - * List of token ids that must be generated. If given a `number[][]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. If given `number[][][]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word. - */ - force_words_ids?: number[][] | number[][][]; - /** - * Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `true` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. - */ - renormalize_logits?: boolean; - /** - * Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by `Constraint` objects, in the most sensible way possible. - */ - constraints?: any[]; - /** - * The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for multilingual models like mBART where the first generated token needs to be the target language token. - */ - forced_bos_token_id?: number; - /** - * The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens. - */ - forced_eos_token_id?: number | number[]; - /** - * Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation. 
- */ - remove_invalid_values?: boolean; - /** - * This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay. - */ - exponential_decay_length_penalty?: number[]; - /** - * A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. - */ - suppress_tokens?: number[]; - /** - * A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. - */ - begin_suppress_tokens?: number[]; - /** - * A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123. - */ - forced_decoder_ids?: number[][]; - /** - * The number of independently computed returned sequences for each element in the batch. - */ - num_return_sequences?: number; - /** - * Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details. - */ - output_attentions?: boolean; - /** - * Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details. - */ - output_hidden_states?: boolean; - /** - * Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - */ - output_scores?: boolean; - /** - * Whether or not to return a `ModelOutput` instead of a plain tuple. - */ - return_dict_in_generate?: boolean; - /** - * The id of the *padding* token. - */ - pad_token_id?: number; - /** - * The id of the *beginning-of-sequence* token. 
- */ - bos_token_id?: number; - /** - * The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - */ - eos_token_id?: number | number[]; - /** - * If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. - */ - encoder_no_repeat_ngram_size?: number; - /** - * If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. - */ - decoder_start_token_id?: number; - /** - * Additional generation kwargs will be forwarded to the `generate` function of the model. Kwargs that are not present in `generate`'s signature will be used in the model forward pass. - */ - generation_kwargs?: any; + /** + * The maximum length the generated tokens can have. Corresponds to the length of the input prompt + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + */ + max_length?: number; + /** + * The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. + */ + max_new_tokens?: number; + /** + * The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set. + */ + min_length?: number; + /** + * The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt. + */ + min_new_tokens?: number; + /** + * Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: + * - `true`, where the generation stops as soon as there are `num_beams` complete candidates; + * - `false`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; + * - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). 
+ */ + early_stopping?: boolean | "never"; + /** + * The maximum amount of time you allow the computation to run for in seconds. Generation will still finish the current pass after allocated time has been passed. + */ + max_time?: number; + /** + * Whether or not to use sampling; use greedy decoding otherwise. + */ + do_sample?: boolean; + /** + * Number of beams for beam search. 1 means no beam search. + */ + num_beams?: number; + /** + * Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. + */ + num_beam_groups?: number; + /** + * The values balance the model confidence and the degeneration penalty in contrastive search decoding. + */ + penalty_alpha?: number; + /** + * Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. + */ + use_cache?: boolean; + /** + * The value used to modulate the next token probabilities. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + top_k?: number; + /** + * If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. + */ + top_p?: number; + /** + * Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation. See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details. + */ + typical_p?: number; + /** + * If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled. 
In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. + */ + epsilon_cutoff?: number; + /** + * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. + */ + eta_cutoff?: number; + /** + * This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. + */ + diversity_penalty?: number; + /** + * The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + */ + repetition_penalty?: number; + /** + * The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty. + */ + encoder_repetition_penalty?: number; + /** + * Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences. + */ + length_penalty?: number; + /** + * If set to int > 0, all ngrams of that size can only occur once. 
+ */ + no_repeat_ngram_size?: number; + /** + * List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `(await tokenizer(bad_words, {add_prefix_space: true, add_special_tokens: false})).input_ids`. + */ + bad_words_ids?: number[][]; + /** + * List of token ids that must be generated. If given a `number[][]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. If given `number[][][]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word. + */ + force_words_ids?: number[][] | number[][][]; + /** + * Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `true` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. + */ + renormalize_logits?: boolean; + /** + * Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by `Constraint` objects, in the most sensible way possible. + */ + constraints?: any[]; + /** + * The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for multilingual models like mBART where the first generated token needs to be the target language token. + */ + forced_bos_token_id?: number; + /** + * The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens. + */ + forced_eos_token_id?: number | number[]; + /** + * Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation. 
+ */ + remove_invalid_values?: boolean; + /** + * This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay. + */ + exponential_decay_length_penalty?: number[]; + /** + * A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. + */ + suppress_tokens?: number[]; + /** + * A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. + */ + begin_suppress_tokens?: number[]; + /** + * A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123. + */ + forced_decoder_ids?: number[][]; + /** + * The number of independently computed returned sequences for each element in the batch. + */ + num_return_sequences?: number; + /** + * Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details. + */ + output_attentions?: boolean; + /** + * Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details. + */ + output_hidden_states?: boolean; + /** + * Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + */ + output_scores?: boolean; + /** + * Whether or not to return a `ModelOutput` instead of a plain tuple. + */ + return_dict_in_generate?: boolean; + /** + * The id of the *padding* token. + */ + pad_token_id?: number; + /** + * The id of the *beginning-of-sequence* token. 
+ */ + bos_token_id?: number; + /** + * The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + */ + eos_token_id?: number | number[]; + /** + * If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. + */ + encoder_no_repeat_ngram_size?: number; + /** + * If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. + */ + decoder_start_token_id?: number; + /** + * Additional generation kwargs will be forwarded to the `generate` function of the model. Kwargs that are not present in `generate`'s signature will be used in the model forward pass. + */ + generation_kwargs?: any; }; -import { Tensor } from './tensor.js'; +import { Tensor } from "./tensor.js"; export {}; //# sourceMappingURL=generation.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/hub.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/hub.d.ts index cfd26cbf7..118d2e4c9 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/hub.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/hub.d.ts @@ -4,7 +4,9 @@ * @param {URL|string} urlOrPath The URL/path of the file to get. * @returns {Promise} A promise that resolves to a FileResponse object (if the file is retrieved using the FileSystem API), or a Response object (if the file is retrieved using the Fetch API). */ -export function getFile(urlOrPath: URL | string): Promise; +export function getFile( + urlOrPath: URL | string, +): Promise; /** * * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. @@ -20,7 +22,12 @@ export function getFile(urlOrPath: URL | string): Promise; +export function getModelFile( + path_or_repo_id: string, + filename: string, + fatal?: boolean, + options?: PretrainedOptions, +): Promise; /** * Fetches a JSON file from a given path and file name. 
* @@ -31,43 +38,48 @@ export function getModelFile(path_or_repo_id: string, filename: string, fatal?: * @returns {Promise} The JSON data parsed into a JavaScript object. * @throws Will throw an error if the file is not found and `fatal` is true. */ -export function getModelJSON(modelPath: string, fileName: string, fatal?: boolean, options?: PretrainedOptions): Promise; +export function getModelJSON( + modelPath: string, + fileName: string, + fatal?: boolean, + options?: PretrainedOptions, +): Promise; /** * Options for loading a pretrained model. */ export type PretrainedOptions = { - /** - * Whether to load the 8-bit quantized version of the model (only applicable when loading model files). - */ - quantized?: boolean | null; - /** - * If specified, this function will be called during model construction, to provide the user with progress updates. - */ - progress_callback?: Function; - /** - * Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: - * - The model is a model provided by the library (loaded with the *model id* string of a pretrained model). - * - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory. - */ - config?: any; - /** - * Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. - */ - cache_dir?: string; - /** - * Whether or not to only look at local files (e.g., not try downloading the model). - */ - local_files_only?: boolean; - /** - * The specific model version to use. It can be a branch name, a tag name, or a commit id, - * since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. - * NOTE: This setting is ignored for local requests. 
- */ - revision?: string; - /** - * If specified, load the model with this name (excluding the .onnx suffix). Currently only valid for encoder- or decoder-only models. - */ - model_file_name?: string; + /** + * Whether to load the 8-bit quantized version of the model (only applicable when loading model files). + */ + quantized?: boolean | null; + /** + * If specified, this function will be called during model construction, to provide the user with progress updates. + */ + progress_callback?: Function; + /** + * Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: + * - The model is a model provided by the library (loaded with the *model id* string of a pretrained model). + * - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory. + */ + config?: any; + /** + * Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. + */ + cache_dir?: string; + /** + * Whether or not to only look at local files (e.g., not try downloading the model). + */ + local_files_only?: boolean; + /** + * The specific model version to use. It can be a branch name, a tag name, or a commit id, + * since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. + * NOTE: This setting is ignored for local requests. + */ + revision?: string; + /** + * If specified, load the model with this name (excluding the .onnx suffix). Currently only valid for encoder- or decoder-only models. + */ + model_file_name?: string; }; /** * @typedef {Object} PretrainedOptions Options for loading a pretrained model. 
@@ -84,71 +96,71 @@ export type PretrainedOptions = { * @property {string} [model_file_name=null] If specified, load the model with this name (excluding the .onnx suffix). Currently only valid for encoder- or decoder-only models. */ declare class FileResponse { - /** - * Creates a new `FileResponse` object. - * @param {string|URL} filePath - */ - constructor(filePath: string | URL); - /** - * Mapping from file extensions to MIME types. - */ - _CONTENT_TYPE_MAP: { - txt: string; - html: string; - css: string; - js: string; - json: string; - png: string; - jpg: string; - jpeg: string; - gif: string; - }; - filePath: string | URL; - headers: Headers; - exists: any; - status: number; - statusText: string; - body: ReadableStream; - /** - * Updates the 'content-type' header property of the response based on the extension of - * the file specified by the filePath property of the current object. - * @returns {void} - */ - updateContentType(): void; - /** - * Clone the current FileResponse object. - * @returns {FileResponse} A new FileResponse object with the same properties as the current object. - */ - clone(): FileResponse; - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with an ArrayBuffer containing the file's contents. - * @returns {Promise} A Promise that resolves with an ArrayBuffer containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - arrayBuffer(): Promise; - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a Blob containing the file's contents. - * @returns {Promise} A Promise that resolves with a Blob containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - blob(): Promise; - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a string containing the file's contents. 
- * @returns {Promise} A Promise that resolves with a string containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - text(): Promise; - /** - * Reads the contents of the file specified by the filePath property and returns a Promise that - * resolves with a parsed JavaScript object containing the file's contents. - * - * @returns {Promise} A Promise that resolves with a parsed JavaScript object containing the file's contents. - * @throws {Error} If the file cannot be read. - */ - json(): Promise; + /** + * Creates a new `FileResponse` object. + * @param {string|URL} filePath + */ + constructor(filePath: string | URL); + /** + * Mapping from file extensions to MIME types. + */ + _CONTENT_TYPE_MAP: { + txt: string; + html: string; + css: string; + js: string; + json: string; + png: string; + jpg: string; + jpeg: string; + gif: string; + }; + filePath: string | URL; + headers: Headers; + exists: any; + status: number; + statusText: string; + body: ReadableStream; + /** + * Updates the 'content-type' header property of the response based on the extension of + * the file specified by the filePath property of the current object. + * @returns {void} + */ + updateContentType(): void; + /** + * Clone the current FileResponse object. + * @returns {FileResponse} A new FileResponse object with the same properties as the current object. + */ + clone(): FileResponse; + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with an ArrayBuffer containing the file's contents. + * @returns {Promise} A Promise that resolves with an ArrayBuffer containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + arrayBuffer(): Promise; + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a Blob containing the file's contents. 
+ * @returns {Promise} A Promise that resolves with a Blob containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + blob(): Promise; + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a string containing the file's contents. + * @returns {Promise} A Promise that resolves with a string containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + text(): Promise; + /** + * Reads the contents of the file specified by the filePath property and returns a Promise that + * resolves with a parsed JavaScript object containing the file's contents. + * + * @returns {Promise} A Promise that resolves with a parsed JavaScript object containing the file's contents. + * @throws {Error} If the file cannot be read. + */ + json(): Promise; } export {}; -//# sourceMappingURL=hub.d.ts.map \ No newline at end of file +//# sourceMappingURL=hub.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/image.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/image.d.ts index cd912a7c7..c2122daee 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/image.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/image.d.ts @@ -1,111 +1,125 @@ export class RawImage { - /** - * Helper method for reading an image from a variety of input types. - * @param {RawImage|string|URL} input - * @returns The image object. - * - * **Example:** Read image from a URL. - * ```javascript - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * // RawImage { - * // "data": Uint8ClampedArray [ 25, 25, 25, 19, 19, 19, ... ], - * // "width": 800, - * // "height": 533, - * // "channels": 3 - * // } - * ``` - */ - static read(input: RawImage | string | URL): Promise; - /** - * Read an image from a URL or file path. 
- * @param {string|URL} url The URL or file path to read the image from. - * @returns {Promise} The image object. - */ - static fromURL(url: string | URL): Promise; - /** - * Helper method to create a new Image from a blob. - * @param {Blob} blob The blob to read the image from. - * @returns {Promise} The image object. - */ - static fromBlob(blob: Blob): Promise; - /** - * Helper method to create a new Image from a tensor - * @param {import('./tensor.js').Tensor} tensor - */ - static fromTensor(tensor: import('./tensor.js').Tensor, channel_format?: string): RawImage; - /** - * Create a new `RawImage` object. - * @param {Uint8ClampedArray|Uint8Array} data The pixel data. - * @param {number} width The width of the image. - * @param {number} height The height of the image. - * @param {1|2|3|4} channels The number of channels. - */ - constructor(data: Uint8ClampedArray | Uint8Array, width: number, height: number, channels: 1 | 2 | 3 | 4); - data: Uint8Array | Uint8ClampedArray; - width: number; - height: number; - channels: 2 | 1 | 3 | 4; - /** - * Returns the size of the image (width, height). - * @returns {[number, number]} The size of the image (width, height). - */ - get size(): [number, number]; - /** - * Convert the image to grayscale format. - * @returns {RawImage} `this` to support chaining. - */ - grayscale(): RawImage; - /** - * Convert the image to RGB format. - * @returns {RawImage} `this` to support chaining. - */ - rgb(): RawImage; - /** - * Convert the image to RGBA format. - * @returns {RawImage} `this` to support chaining. - */ - rgba(): RawImage; - /** - * Resize the image to the given dimensions. This method uses the canvas API to perform the resizing. - * @param {number} width The width of the new image. - * @param {number} height The height of the new image. - * @param {Object} options Additional options for resizing. - * @param {0|1|2|3|4|5|string} [options.resample] The resampling method to use. - * @returns {Promise} `this` to support chaining. 
- */ - resize(width: number, height: number, { resample, }?: { - resample?: 0 | 1 | 2 | 3 | 4 | 5 | string; - }): Promise; - pad([left, right, top, bottom]: [any, any, any, any]): Promise; - crop([x_min, y_min, x_max, y_max]: [any, any, any, any]): Promise; - center_crop(crop_width: any, crop_height: any): Promise; - toBlob(type?: string, quality?: number): Promise; - toCanvas(): any; - /** - * Helper method to update the image data. - * @param {Uint8ClampedArray} data The new image data. - * @param {number} width The new width of the image. - * @param {number} height The new height of the image. - * @param {1|2|3|4|null} [channels] The new number of channels of the image. - * @private - */ - private _update; - /** - * Clone the image - * @returns {RawImage} The cloned image - */ - clone(): RawImage; - /** - * Helper method for converting image to have a certain number of channels - * @param {number} numChannels The number of channels. Must be 1, 3, or 4. - * @returns {RawImage} `this` to support chaining. - */ - convert(numChannels: number): RawImage; - /** - * Save the image to the given path. - * @param {string} path The path to save the image to. - */ - save(path: string): Promise; - toSharp(): any; + /** + * Helper method for reading an image from a variety of input types. + * @param {RawImage|string|URL} input + * @returns The image object. + * + * **Example:** Read image from a URL. + * ```javascript + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * // RawImage { + * // "data": Uint8ClampedArray [ 25, 25, 25, 19, 19, 19, ... ], + * // "width": 800, + * // "height": 533, + * // "channels": 3 + * // } + * ``` + */ + static read(input: RawImage | string | URL): Promise; + /** + * Read an image from a URL or file path. + * @param {string|URL} url The URL or file path to read the image from. + * @returns {Promise} The image object. 
+ */ + static fromURL(url: string | URL): Promise; + /** + * Helper method to create a new Image from a blob. + * @param {Blob} blob The blob to read the image from. + * @returns {Promise} The image object. + */ + static fromBlob(blob: Blob): Promise; + /** + * Helper method to create a new Image from a tensor + * @param {import('./tensor.js').Tensor} tensor + */ + static fromTensor( + tensor: import("./tensor.js").Tensor, + channel_format?: string, + ): RawImage; + /** + * Create a new `RawImage` object. + * @param {Uint8ClampedArray|Uint8Array} data The pixel data. + * @param {number} width The width of the image. + * @param {number} height The height of the image. + * @param {1|2|3|4} channels The number of channels. + */ + constructor( + data: Uint8ClampedArray | Uint8Array, + width: number, + height: number, + channels: 1 | 2 | 3 | 4, + ); + data: Uint8Array | Uint8ClampedArray; + width: number; + height: number; + channels: 2 | 1 | 3 | 4; + /** + * Returns the size of the image (width, height). + * @returns {[number, number]} The size of the image (width, height). + */ + get size(): [number, number]; + /** + * Convert the image to grayscale format. + * @returns {RawImage} `this` to support chaining. + */ + grayscale(): RawImage; + /** + * Convert the image to RGB format. + * @returns {RawImage} `this` to support chaining. + */ + rgb(): RawImage; + /** + * Convert the image to RGBA format. + * @returns {RawImage} `this` to support chaining. + */ + rgba(): RawImage; + /** + * Resize the image to the given dimensions. This method uses the canvas API to perform the resizing. + * @param {number} width The width of the new image. + * @param {number} height The height of the new image. + * @param {Object} options Additional options for resizing. + * @param {0|1|2|3|4|5|string} [options.resample] The resampling method to use. + * @returns {Promise} `this` to support chaining. 
+ */ + resize( + width: number, + height: number, + { + resample, + }?: { + resample?: 0 | 1 | 2 | 3 | 4 | 5 | string; + }, + ): Promise; + pad([left, right, top, bottom]: [any, any, any, any]): Promise; + crop([x_min, y_min, x_max, y_max]: [any, any, any, any]): Promise; + center_crop(crop_width: any, crop_height: any): Promise; + toBlob(type?: string, quality?: number): Promise; + toCanvas(): any; + /** + * Helper method to update the image data. + * @param {Uint8ClampedArray} data The new image data. + * @param {number} width The new width of the image. + * @param {number} height The new height of the image. + * @param {1|2|3|4|null} [channels] The new number of channels of the image. + * @private + */ + private _update; + /** + * Clone the image + * @returns {RawImage} The cloned image + */ + clone(): RawImage; + /** + * Helper method for converting image to have a certain number of channels + * @param {number} numChannels The number of channels. Must be 1, 3, or 4. + * @returns {RawImage} `this` to support chaining. + */ + convert(numChannels: number): RawImage; + /** + * Save the image to the given path. + * @param {string} path The path to save the image to. 
+ */ + save(path: string): Promise; + toSharp(): any; } -//# sourceMappingURL=image.d.ts.map \ No newline at end of file +//# sourceMappingURL=image.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/maths.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/maths.d.ts index 840ce70da..01840baf4 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/maths.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/maths.d.ts @@ -14,7 +14,13 @@ /** * @param {TypedArray} input */ -export function interpolate_data(input: TypedArray, [in_channels, in_height, in_width]: [any, any, any], [out_height, out_width]: [any, any], mode?: string, align_corners?: boolean): any; +export function interpolate_data( + input: TypedArray, + [in_channels, in_height, in_width]: [any, any, any], + [out_height, out_width]: [any, any], + mode?: string, + align_corners?: boolean, +): any; /** * Helper method to transpose a `AnyTypedArray` directly * @template {AnyTypedArray} T @@ -23,7 +29,11 @@ export function interpolate_data(input: TypedArray, [in_channels, in_height, in_ * @param {number[]} axes * @returns {[T, number[]]} The transposed array and the new shape. */ -export function transpose_data(array: T, dims: number[], axes: number[]): [T, number[]]; +export function transpose_data( + array: T, + dims: number[], + axes: number[], +): [T, number[]]; /** * Compute the softmax of an array of numbers. * @template {TypedArray|number[]} T @@ -51,7 +61,10 @@ export function dot(arr1: number[], arr2: number[]): number; * @param {number|null} [top_k=0] The number of top items to return (default: 0 = return all) * @returns {[number, any][]} The top k items, sorted by descending order */ -export function getTopItems(items: any[] | TypedArray, top_k?: number | null): [number, any][]; +export function getTopItems( + items: any[] | TypedArray, + top_k?: number | null, +): [number, any][]; /** * Computes the cosine similarity between two arrays. 
* @@ -94,15 +107,24 @@ export function medianFilter(data: AnyTypedArray, windowSize: number): any; */ export function round(num: number, decimals: number): number; export class FFT { - constructor(fft_length: any); - fft_length: any; - isPowerOfTwo: boolean; - fft: P2FFT | NP2FFT; - outputBufferSize: number; - realTransform(out: any, input: any): void; - transform(out: any, input: any): void; + constructor(fft_length: any); + fft_length: any; + isPowerOfTwo: boolean; + fft: P2FFT | NP2FFT; + outputBufferSize: number; + realTransform(out: any, input: any): void; + transform(out: any, input: any): void; } -export type TypedArray = Int8Array | Uint8Array | Uint8ClampedArray | Int16Array | Uint16Array | Int32Array | Uint32Array | Float32Array | Float64Array; +export type TypedArray = + | Int8Array + | Uint8Array + | Uint8ClampedArray + | Int16Array + | Uint16Array + | Int32Array + | Uint32Array + | Float32Array + | Float64Array; export type BigTypedArray = BigInt64Array | BigUint64Array; export type AnyTypedArray = TypedArray | BigTypedArray; /** @@ -113,140 +135,166 @@ export type AnyTypedArray = TypedArray | BigTypedArray; * Code adapted from https://www.npmjs.com/package/fft.js */ declare class P2FFT { - /** - * @param {number} size The size of the input array. Must be a power of two larger than 1. - * @throws {Error} FFT size must be a power of two larger than 1. - */ - constructor(size: number); - size: number; - _csize: number; - table: Float64Array; - _width: number; - _bitrev: Int32Array; - /** - * Create a complex number array with size `2 * size` - * - * @returns {Float64Array} A complex number array with size `2 * size` - */ - createComplexArray(): Float64Array; - /** - * Converts a complex number representation stored in a Float64Array to an array of real numbers. - * - * @param {Float64Array} complex The complex number representation to be converted. - * @param {number[]} [storage] An optional array to store the result in. 
- * @returns {number[]} An array of real numbers representing the input complex number representation. - */ - fromComplexArray(complex: Float64Array, storage?: number[]): number[]; - /** - * Convert a real-valued input array to a complex-valued output array. - * @param {Float64Array} input The real-valued input array. - * @param {Float64Array} [storage] Optional buffer to store the output array. - * @returns {Float64Array} The complex-valued output array. - */ - toComplexArray(input: Float64Array, storage?: Float64Array): Float64Array; - /** - * Completes the spectrum by adding its mirrored negative frequency components. - * @param {Float64Array} spectrum The input spectrum. - * @returns {void} - */ - completeSpectrum(spectrum: Float64Array): void; - /** - * Performs a Fast Fourier Transform (FFT) on the given input data and stores the result in the output buffer. - * - * @param {Float64Array} out The output buffer to store the result. - * @param {Float64Array} data The input data to transform. - * - * @throws {Error} Input and output buffers must be different. - * - * @returns {void} - */ - transform(out: Float64Array, data: Float64Array): void; - /** - * Performs a real-valued forward FFT on the given input buffer and stores the result in the given output buffer. - * The input buffer must contain real values only, while the output buffer will contain complex values. The input and - * output buffers must be different. - * - * @param {Float64Array} out The output buffer. - * @param {Float64Array} data The input buffer containing real values. - * - * @throws {Error} If the input and output buffers are the same. - */ - realTransform(out: Float64Array, data: Float64Array): void; - /** - * Performs an inverse FFT transformation on the given `data` array, and stores the result in `out`. - * The `out` array must be a different buffer than the `data` array. The `out` array will contain the - * result of the transformation. The `data` array will not be modified. 
- * - * @param {Float64Array} out The output buffer for the transformed data. - * @param {Float64Array} data The input data to transform. - * @throws {Error} If `out` and `data` refer to the same buffer. - * @returns {void} - */ - inverseTransform(out: Float64Array, data: Float64Array): void; - /** - * Performs a radix-4 implementation of a discrete Fourier transform on a given set of data. - * - * @param {Float64Array} out The output buffer for the transformed data. - * @param {Float64Array} data The input buffer of data to be transformed. - * @param {number} inv A scaling factor to apply to the transform. - * @returns {void} - */ - _transform4(out: Float64Array, data: Float64Array, inv: number): void; - /** - * Performs a radix-2 implementation of a discrete Fourier transform on a given set of data. - * - * @param {Float64Array} data The input buffer of data to be transformed. - * @param {Float64Array} out The output buffer for the transformed data. - * @param {number} outOff The offset at which to write the output data. - * @param {number} off The offset at which to begin reading the input data. - * @param {number} step The step size for indexing the input data. 
- * @returns {void} - */ - _singleTransform2(data: Float64Array, out: Float64Array, outOff: number, off: number, step: number): void; - /** - * Performs radix-4 transformation on input data of length 8 - * - * @param {Float64Array} data Input data array of length 8 - * @param {Float64Array} out Output data array of length 8 - * @param {number} outOff Index of output array to start writing from - * @param {number} off Index of input array to start reading from - * @param {number} step Step size between elements in input array - * @param {number} inv Scaling factor for inverse transform - * - * @returns {void} - */ - _singleTransform4(data: Float64Array, out: Float64Array, outOff: number, off: number, step: number, inv: number): void; - /** - * Real input radix-4 implementation - * @param {Float64Array} out Output array for the transformed data - * @param {Float64Array} data Input array of real data to be transformed - * @param {number} inv The scale factor used to normalize the inverse transform - */ - _realTransform4(out: Float64Array, data: Float64Array, inv: number): void; - /** - * Performs a single real input radix-2 transformation on the provided data - * - * @param {Float64Array} data The input data array - * @param {Float64Array} out The output data array - * @param {number} outOff The output offset - * @param {number} off The input offset - * @param {number} step The step - * - * @returns {void} - */ - _singleRealTransform2(data: Float64Array, out: Float64Array, outOff: number, off: number, step: number): void; - /** - * Computes a single real-valued transform using radix-4 algorithm. - * This method is only called for len=8. - * - * @param {Float64Array} data The input data array. - * @param {Float64Array} out The output data array. - * @param {number} outOff The offset into the output array. - * @param {number} off The offset into the input array. - * @param {number} step The step size for the input array. - * @param {number} inv The value of inverse. 
- */ - _singleRealTransform4(data: Float64Array, out: Float64Array, outOff: number, off: number, step: number, inv: number): void; + /** + * @param {number} size The size of the input array. Must be a power of two larger than 1. + * @throws {Error} FFT size must be a power of two larger than 1. + */ + constructor(size: number); + size: number; + _csize: number; + table: Float64Array; + _width: number; + _bitrev: Int32Array; + /** + * Create a complex number array with size `2 * size` + * + * @returns {Float64Array} A complex number array with size `2 * size` + */ + createComplexArray(): Float64Array; + /** + * Converts a complex number representation stored in a Float64Array to an array of real numbers. + * + * @param {Float64Array} complex The complex number representation to be converted. + * @param {number[]} [storage] An optional array to store the result in. + * @returns {number[]} An array of real numbers representing the input complex number representation. + */ + fromComplexArray(complex: Float64Array, storage?: number[]): number[]; + /** + * Convert a real-valued input array to a complex-valued output array. + * @param {Float64Array} input The real-valued input array. + * @param {Float64Array} [storage] Optional buffer to store the output array. + * @returns {Float64Array} The complex-valued output array. + */ + toComplexArray(input: Float64Array, storage?: Float64Array): Float64Array; + /** + * Completes the spectrum by adding its mirrored negative frequency components. + * @param {Float64Array} spectrum The input spectrum. + * @returns {void} + */ + completeSpectrum(spectrum: Float64Array): void; + /** + * Performs a Fast Fourier Transform (FFT) on the given input data and stores the result in the output buffer. + * + * @param {Float64Array} out The output buffer to store the result. + * @param {Float64Array} data The input data to transform. + * + * @throws {Error} Input and output buffers must be different. 
+ * + * @returns {void} + */ + transform(out: Float64Array, data: Float64Array): void; + /** + * Performs a real-valued forward FFT on the given input buffer and stores the result in the given output buffer. + * The input buffer must contain real values only, while the output buffer will contain complex values. The input and + * output buffers must be different. + * + * @param {Float64Array} out The output buffer. + * @param {Float64Array} data The input buffer containing real values. + * + * @throws {Error} If the input and output buffers are the same. + */ + realTransform(out: Float64Array, data: Float64Array): void; + /** + * Performs an inverse FFT transformation on the given `data` array, and stores the result in `out`. + * The `out` array must be a different buffer than the `data` array. The `out` array will contain the + * result of the transformation. The `data` array will not be modified. + * + * @param {Float64Array} out The output buffer for the transformed data. + * @param {Float64Array} data The input data to transform. + * @throws {Error} If `out` and `data` refer to the same buffer. + * @returns {void} + */ + inverseTransform(out: Float64Array, data: Float64Array): void; + /** + * Performs a radix-4 implementation of a discrete Fourier transform on a given set of data. + * + * @param {Float64Array} out The output buffer for the transformed data. + * @param {Float64Array} data The input buffer of data to be transformed. + * @param {number} inv A scaling factor to apply to the transform. + * @returns {void} + */ + _transform4(out: Float64Array, data: Float64Array, inv: number): void; + /** + * Performs a radix-2 implementation of a discrete Fourier transform on a given set of data. + * + * @param {Float64Array} data The input buffer of data to be transformed. + * @param {Float64Array} out The output buffer for the transformed data. + * @param {number} outOff The offset at which to write the output data. 
+ * @param {number} off The offset at which to begin reading the input data. + * @param {number} step The step size for indexing the input data. + * @returns {void} + */ + _singleTransform2( + data: Float64Array, + out: Float64Array, + outOff: number, + off: number, + step: number, + ): void; + /** + * Performs radix-4 transformation on input data of length 8 + * + * @param {Float64Array} data Input data array of length 8 + * @param {Float64Array} out Output data array of length 8 + * @param {number} outOff Index of output array to start writing from + * @param {number} off Index of input array to start reading from + * @param {number} step Step size between elements in input array + * @param {number} inv Scaling factor for inverse transform + * + * @returns {void} + */ + _singleTransform4( + data: Float64Array, + out: Float64Array, + outOff: number, + off: number, + step: number, + inv: number, + ): void; + /** + * Real input radix-4 implementation + * @param {Float64Array} out Output array for the transformed data + * @param {Float64Array} data Input array of real data to be transformed + * @param {number} inv The scale factor used to normalize the inverse transform + */ + _realTransform4(out: Float64Array, data: Float64Array, inv: number): void; + /** + * Performs a single real input radix-2 transformation on the provided data + * + * @param {Float64Array} data The input data array + * @param {Float64Array} out The output data array + * @param {number} outOff The output offset + * @param {number} off The input offset + * @param {number} step The step + * + * @returns {void} + */ + _singleRealTransform2( + data: Float64Array, + out: Float64Array, + outOff: number, + off: number, + step: number, + ): void; + /** + * Computes a single real-valued transform using radix-4 algorithm. + * This method is only called for len=8. + * + * @param {Float64Array} data The input data array. + * @param {Float64Array} out The output data array. 
+ * @param {number} outOff The offset into the output array. + * @param {number} off The offset into the input array. + * @param {number} step The step size for the input array. + * @param {number} inv The value of inverse. + */ + _singleRealTransform4( + data: Float64Array, + out: Float64Array, + outOff: number, + off: number, + step: number, + inv: number, + ): void; } /** * NP2FFT class provides functionality for performing Fast Fourier Transform on arrays @@ -255,23 +303,23 @@ declare class P2FFT { * For more information, see: https://math.stackexchange.com/questions/77118/non-power-of-2-ffts/77156#77156 */ declare class NP2FFT { - /** - * Constructs a new NP2FFT object. - * @param {number} fft_length The length of the FFT - */ - constructor(fft_length: number); - bufferSize: number; - _a: number; - _chirpBuffer: Float64Array; - _buffer1: Float64Array; - _buffer2: Float64Array; - _outBuffer1: Float64Array; - _outBuffer2: Float64Array; - _slicedChirpBuffer: Float64Array; - _f: P2FFT; - _transform(output: any, input: any, real: any): void; - transform(output: any, input: any): void; - realTransform(output: any, input: any): void; + /** + * Constructs a new NP2FFT object. 
+ * @param {number} fft_length The length of the FFT + */ + constructor(fft_length: number); + bufferSize: number; + _a: number; + _chirpBuffer: Float64Array; + _buffer1: Float64Array; + _buffer2: Float64Array; + _outBuffer1: Float64Array; + _outBuffer2: Float64Array; + _slicedChirpBuffer: Float64Array; + _f: P2FFT; + _transform(output: any, input: any, real: any): void; + transform(output: any, input: any): void; + realTransform(output: any, input: any): void; } export {}; -//# sourceMappingURL=maths.d.ts.map \ No newline at end of file +//# sourceMappingURL=maths.d.ts.map diff --git a/core/vendor/modules/@xenova/transformers/types/utils/tensor.d.ts b/core/vendor/modules/@xenova/transformers/types/utils/tensor.d.ts index da2650e42..f250b1e00 100644 --- a/core/vendor/modules/@xenova/transformers/types/utils/tensor.d.ts +++ b/core/vendor/modules/@xenova/transformers/types/utils/tensor.d.ts @@ -13,14 +13,22 @@ export function transpose(tensor: any, axes: any[]): Tensor; * @param {boolean} align_corners Whether to align corners. * @returns {Tensor} The interpolated tensor. */ -export function interpolate(input: Tensor, [out_height, out_width]: number[], mode?: string, align_corners?: boolean): Tensor; +export function interpolate( + input: Tensor, + [out_height, out_width]: number[], + mode?: string, + align_corners?: boolean, +): Tensor; /** * Perform mean pooling of the last hidden state followed by a normalization step. * @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim] * @param {Tensor} attention_mask Tensor of shape [batchSize, seqLength] * @returns {Tensor} Returns a new Tensor of shape [batchSize, embedDim]. */ -export function mean_pooling(last_hidden_state: Tensor, attention_mask: Tensor): Tensor; +export function mean_pooling( + last_hidden_state: Tensor, + attention_mask: Tensor, +): Tensor; /** * Concatenates an array of tensors along a specified dimension. * @param {Tensor[]} tensors The array of tensors to concatenate. 
@@ -43,7 +51,12 @@ export function stack(tensors: Tensor[], dim?: number): Tensor; * @param {boolean} keepdim whether the output tensor has dim retained or not. * @returns {Tensor[]} A tuple of (std, mean) tensors. */ -export function std_mean(input: Tensor, dim?: number | null, correction?: number, keepdim?: boolean): Tensor[]; +export function std_mean( + input: Tensor, + dim?: number | null, + correction?: number, + keepdim?: boolean, +): Tensor[]; /** * Returns the mean value of each row of the input tensor in the given dimension dim. * @param {Tensor} input the input tensor. @@ -51,7 +64,11 @@ export function std_mean(input: Tensor, dim?: number | null, correction?: number * @param {boolean} keepdim whether the output tensor has dim retained or not. * @returns A new tensor with means taken along the specified dimension. */ -export function mean(input: Tensor, dim?: number | null, keepdim?: boolean): Tensor; +export function mean( + input: Tensor, + dim?: number | null, + keepdim?: boolean, +): Tensor; /** * * Measures similarity between two temporal sequences (e.g., input audio and output tokens @@ -72,228 +89,236 @@ export function ones(size: number[]): Tensor; */ export function ones_like(tensor: Tensor): Tensor; export class Tensor { - /** - * Create a new Tensor or copy an existing Tensor. - * @param {[DataType, DataArray, number[]]|[import('onnxruntime-common').Tensor]} args - */ - constructor(...args: [DataType, DataArray, number[]] | [import('onnxruntime-common').Tensor]); - /** @type {number[]} Dimensions of the tensor. */ - dims: number[]; - /** @type {DataType} Type of the tensor. */ - type: DataType; - /** @type {DataArray} The data stored in the tensor. */ - data: DataArray; - /** @type {number} The number of elements in the tensor. */ - size: number; - /** - * Index into a Tensor object. - * @param {number} index The index to access. - * @returns {Tensor} The data at the specified index. 
- */ - _getitem(index: number): Tensor; - /** - * @param {number|bigint} item The item to search for in the tensor - * @returns {number} The index of the first occurrence of item in the tensor data. - */ - indexOf(item: number | bigint): number; - /** - * @param {number} index - * @param {number} iterSize - * @param {any} iterDims - * @returns {Tensor} - */ - _subarray(index: number, iterSize: number, iterDims: any): Tensor; - /** - * Returns the value of this tensor as a standard JavaScript Number. This only works - * for tensors with one element. For other cases, see `Tensor.tolist()`. - * @returns {number|bigint} The value of this tensor as a standard JavaScript Number. - * @throws {Error} If the tensor has more than one element. - */ - item(): number | bigint; - /** - * Convert tensor data to a n-dimensional JS list - * @returns {Array} - */ - tolist(): any[]; - /** - * Return a new Tensor with the sigmoid function applied to each element. - * @returns {Tensor} The tensor with the sigmoid function applied. - */ - sigmoid(): Tensor; - /** - * Applies the sigmoid function to the tensor in place. - * @returns {Tensor} Returns `this`. - */ - sigmoid_(): Tensor; - /** - * Return a new Tensor with every element multiplied by a constant. - * @param {number} val The value to multiply by. - * @returns {Tensor} The new tensor. - */ - mul(val: number): Tensor; - /** - * Multiply the tensor by a constant in place. - * @param {number} val The value to multiply by. - * @returns {Tensor} Returns `this`. - */ - mul_(val: number): Tensor; - /** - * Return a new Tensor with every element added by a constant. - * @param {number} val The value to add by. - * @returns {Tensor} The new tensor. - */ - add(val: number): Tensor; - /** - * Add the tensor by a constant in place. - * @param {number} val The value to add by. - * @returns {Tensor} Returns `this`. 
- */ - add_(val: number): Tensor; - clone(): Tensor; - slice(...slices: any[]): Tensor; - /** - * Return a transposed version of this Tensor, according to the provided dimensions. - * @param {...number} dims Dimensions to transpose. - * @returns {Tensor} The transposed tensor. - */ - transpose(...dims: number[]): Tensor; - /** - * Returns the sum of each row of the input tensor in the given dimension dim. - * - * @param {number} [dim=null] The dimension or dimensions to reduce. If `null`, all dimensions are reduced. - * @param {boolean} keepdim Whether the output tensor has `dim` retained or not. - * @returns The summed tensor - */ - sum(dim?: number, keepdim?: boolean): Tensor; - /** - * Returns the matrix norm or vector norm of a given tensor. - * @param {number|string} [p='fro'] The order of norm - * @param {number} [dim=null] Specifies which dimension of the tensor to calculate the norm across. - * If dim is None, the norm will be calculated across all dimensions of input. - * @param {boolean} [keepdim=false] Whether the output tensors have dim retained or not. - * @returns {Tensor} The norm of the tensor. - */ - norm(p?: number | string, dim?: number, keepdim?: boolean): Tensor; - /** - * Performs `L_p` normalization of inputs over specified dimension. Operates in place. - * @param {number} [p=2] The exponent value in the norm formulation - * @param {number} [dim=1] The dimension to reduce - * @returns {Tensor} `this` for operation chaining. - */ - normalize_(p?: number, dim?: number): Tensor; - /** - * Performs `L_p` normalization of inputs over specified dimension. - * @param {number} [p=2] The exponent value in the norm formulation - * @param {number} [dim=1] The dimension to reduce - * @returns {Tensor} The normalized tensor. - */ - normalize(p?: number, dim?: number): Tensor; - /** - * Compute and return the stride of this tensor. - * Stride is the jump necessary to go from one element to the next one in the specified dimension dim. 
- * @returns {number[]} The stride of this tensor. - */ - stride(): number[]; - /** - * Returns a tensor with all specified dimensions of input of size 1 removed. - * - * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. - * If you would like a copy, use `tensor.clone()` before squeezing. - * - * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions. - * @returns The squeezed tensor - */ - squeeze(dim?: number): Tensor; - /** - * In-place version of @see {@link Tensor.squeeze} - */ - squeeze_(dim?: any): this; - /** - * Returns a new tensor with a dimension of size one inserted at the specified position. - * - * NOTE: The returned tensor shares the same underlying data with this tensor. - * - * @param {number} dim The index at which to insert the singleton dimension - * @returns The unsqueezed tensor - */ - unsqueeze(dim?: number): Tensor; - /** - * In-place version of @see {@link Tensor.unsqueeze} - */ - unsqueeze_(dim?: any): this; - /** - * In-place version of @see {@link Tensor.flatten} - */ - flatten_(start_dim?: number, end_dim?: number): this; - /** - * Flattens input by reshaping it into a one-dimensional tensor. - * If `start_dim` or `end_dim` are passed, only dimensions starting with `start_dim` - * and ending with `end_dim` are flattened. The order of elements in input is unchanged. - * @param {number} start_dim the first dim to flatten - * @param {number} end_dim the last dim to flatten - * @returns The flattened tensor. - */ - flatten(start_dim?: number, end_dim?: number): Tensor; - /** - * Returns a new tensor with the same data as the `self` tensor but of a different `shape`. 
- * @param {...number} dims the desired size - * @returns {Tensor} The tensor with the same data but different shape - */ - view(...dims: number[]): Tensor; - neg_(): this; - neg(): Tensor; - /** - * In-place version of @see {@link Tensor.clamp} - */ - clamp_(min: any, max: any): this; - /** - * Clamps all elements in input into the range [ min, max ] - * @param {number} min lower-bound of the range to be clamped to - * @param {number} max upper-bound of the range to be clamped to - * @returns the output tensor. - */ - clamp(min: number, max: number): Tensor; - /** - * In-place version of @see {@link Tensor.round} - */ - round_(): this; - /** - * Rounds elements of input to the nearest integer. - * @returns the output tensor. - */ - round(): Tensor; - /** - * Performs Tensor dtype conversion. - * @param {DataType} type The desired data type. - * @returns {Tensor} The converted tensor. - */ - to(type: DataType): Tensor; - /** - * Returns an iterator object for iterating over the tensor data in row-major order. - * If the tensor has more than one dimension, the iterator will yield subarrays. - * @returns {Iterator} An iterator object for iterating over the tensor data in row-major order. - */ - [Symbol.iterator](): Iterator; + /** + * Create a new Tensor or copy an existing Tensor. + * @param {[DataType, DataArray, number[]]|[import('onnxruntime-common').Tensor]} args + */ + constructor( + ...args: + | [DataType, DataArray, number[]] + | [import("onnxruntime-common").Tensor] + ); + /** @type {number[]} Dimensions of the tensor. */ + dims: number[]; + /** @type {DataType} Type of the tensor. */ + type: DataType; + /** @type {DataArray} The data stored in the tensor. */ + data: DataArray; + /** @type {number} The number of elements in the tensor. */ + size: number; + /** + * Index into a Tensor object. + * @param {number} index The index to access. + * @returns {Tensor} The data at the specified index. 
+ */ + _getitem(index: number): Tensor; + /** + * @param {number|bigint} item The item to search for in the tensor + * @returns {number} The index of the first occurrence of item in the tensor data. + */ + indexOf(item: number | bigint): number; + /** + * @param {number} index + * @param {number} iterSize + * @param {any} iterDims + * @returns {Tensor} + */ + _subarray(index: number, iterSize: number, iterDims: any): Tensor; + /** + * Returns the value of this tensor as a standard JavaScript Number. This only works + * for tensors with one element. For other cases, see `Tensor.tolist()`. + * @returns {number|bigint} The value of this tensor as a standard JavaScript Number. + * @throws {Error} If the tensor has more than one element. + */ + item(): number | bigint; + /** + * Convert tensor data to a n-dimensional JS list + * @returns {Array} + */ + tolist(): any[]; + /** + * Return a new Tensor with the sigmoid function applied to each element. + * @returns {Tensor} The tensor with the sigmoid function applied. + */ + sigmoid(): Tensor; + /** + * Applies the sigmoid function to the tensor in place. + * @returns {Tensor} Returns `this`. + */ + sigmoid_(): Tensor; + /** + * Return a new Tensor with every element multiplied by a constant. + * @param {number} val The value to multiply by. + * @returns {Tensor} The new tensor. + */ + mul(val: number): Tensor; + /** + * Multiply the tensor by a constant in place. + * @param {number} val The value to multiply by. + * @returns {Tensor} Returns `this`. + */ + mul_(val: number): Tensor; + /** + * Return a new Tensor with every element added by a constant. + * @param {number} val The value to add by. + * @returns {Tensor} The new tensor. + */ + add(val: number): Tensor; + /** + * Add the tensor by a constant in place. + * @param {number} val The value to add by. + * @returns {Tensor} Returns `this`. 
+ */ + add_(val: number): Tensor; + clone(): Tensor; + slice(...slices: any[]): Tensor; + /** + * Return a transposed version of this Tensor, according to the provided dimensions. + * @param {...number} dims Dimensions to transpose. + * @returns {Tensor} The transposed tensor. + */ + transpose(...dims: number[]): Tensor; + /** + * Returns the sum of each row of the input tensor in the given dimension dim. + * + * @param {number} [dim=null] The dimension or dimensions to reduce. If `null`, all dimensions are reduced. + * @param {boolean} keepdim Whether the output tensor has `dim` retained or not. + * @returns The summed tensor + */ + sum(dim?: number, keepdim?: boolean): Tensor; + /** + * Returns the matrix norm or vector norm of a given tensor. + * @param {number|string} [p='fro'] The order of norm + * @param {number} [dim=null] Specifies which dimension of the tensor to calculate the norm across. + * If dim is None, the norm will be calculated across all dimensions of input. + * @param {boolean} [keepdim=false] Whether the output tensors have dim retained or not. + * @returns {Tensor} The norm of the tensor. + */ + norm(p?: number | string, dim?: number, keepdim?: boolean): Tensor; + /** + * Performs `L_p` normalization of inputs over specified dimension. Operates in place. + * @param {number} [p=2] The exponent value in the norm formulation + * @param {number} [dim=1] The dimension to reduce + * @returns {Tensor} `this` for operation chaining. + */ + normalize_(p?: number, dim?: number): Tensor; + /** + * Performs `L_p` normalization of inputs over specified dimension. + * @param {number} [p=2] The exponent value in the norm formulation + * @param {number} [dim=1] The dimension to reduce + * @returns {Tensor} The normalized tensor. + */ + normalize(p?: number, dim?: number): Tensor; + /** + * Compute and return the stride of this tensor. + * Stride is the jump necessary to go from one element to the next one in the specified dimension dim. 
+ * @returns {number[]} The stride of this tensor. + */ + stride(): number[]; + /** + * Returns a tensor with all specified dimensions of input of size 1 removed. + * + * NOTE: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. + * If you would like a copy, use `tensor.clone()` before squeezing. + * + * @param {number} [dim=null] If given, the input will be squeezed only in the specified dimensions. + * @returns The squeezed tensor + */ + squeeze(dim?: number): Tensor; + /** + * In-place version of @see {@link Tensor.squeeze} + */ + squeeze_(dim?: any): this; + /** + * Returns a new tensor with a dimension of size one inserted at the specified position. + * + * NOTE: The returned tensor shares the same underlying data with this tensor. + * + * @param {number} dim The index at which to insert the singleton dimension + * @returns The unsqueezed tensor + */ + unsqueeze(dim?: number): Tensor; + /** + * In-place version of @see {@link Tensor.unsqueeze} + */ + unsqueeze_(dim?: any): this; + /** + * In-place version of @see {@link Tensor.flatten} + */ + flatten_(start_dim?: number, end_dim?: number): this; + /** + * Flattens input by reshaping it into a one-dimensional tensor. + * If `start_dim` or `end_dim` are passed, only dimensions starting with `start_dim` + * and ending with `end_dim` are flattened. The order of elements in input is unchanged. + * @param {number} start_dim the first dim to flatten + * @param {number} end_dim the last dim to flatten + * @returns The flattened tensor. + */ + flatten(start_dim?: number, end_dim?: number): Tensor; + /** + * Returns a new tensor with the same data as the `self` tensor but of a different `shape`. 
+ * @param {...number} dims the desired size + * @returns {Tensor} The tensor with the same data but different shape + */ + view(...dims: number[]): Tensor; + neg_(): this; + neg(): Tensor; + /** + * In-place version of @see {@link Tensor.clamp} + */ + clamp_(min: any, max: any): this; + /** + * Clamps all elements in input into the range [ min, max ] + * @param {number} min lower-bound of the range to be clamped to + * @param {number} max upper-bound of the range to be clamped to + * @returns the output tensor. + */ + clamp(min: number, max: number): Tensor; + /** + * In-place version of @see {@link Tensor.round} + */ + round_(): this; + /** + * Rounds elements of input to the nearest integer. + * @returns the output tensor. + */ + round(): Tensor; + /** + * Performs Tensor dtype conversion. + * @param {DataType} type The desired data type. + * @returns {Tensor} The converted tensor. + */ + to(type: DataType): Tensor; + /** + * Returns an iterator object for iterating over the tensor data in row-major order. + * If the tensor has more than one dimension, the iterator will yield subarrays. + * @returns {Iterator} An iterator object for iterating over the tensor data in row-major order. + */ + [Symbol.iterator](): Iterator; } /** * This creates a nested array of a given type and depth (see examples). */ -export type NestArray = Acc['length'] extends Depth ? T : NestArray; +export type NestArray< + T, + Depth extends number, + Acc extends never[] = [], +> = Acc["length"] extends Depth ? 
T : NestArray; export type DataType = keyof typeof DataTypeMap; -export type DataArray = import('./maths.js').AnyTypedArray | any[]; +export type DataArray = import("./maths.js").AnyTypedArray | any[]; declare const DataTypeMap: Readonly<{ - float32: Float32ArrayConstructor; - float64: Float64ArrayConstructor; - string: ArrayConstructor; - int8: Int8ArrayConstructor; - uint8: Uint8ArrayConstructor; - int16: Int16ArrayConstructor; - uint16: Uint16ArrayConstructor; - int32: Int32ArrayConstructor; - uint32: Uint32ArrayConstructor; - int64: BigInt64ArrayConstructor; - uint64: BigUint64ArrayConstructor; - bool: Uint8ArrayConstructor; + float32: Float32ArrayConstructor; + float64: Float64ArrayConstructor; + string: ArrayConstructor; + int8: Int8ArrayConstructor; + uint8: Uint8ArrayConstructor; + int16: Int16ArrayConstructor; + uint16: Uint16ArrayConstructor; + int32: Int32ArrayConstructor; + uint32: Uint32ArrayConstructor; + int64: BigInt64ArrayConstructor; + uint64: BigUint64ArrayConstructor; + bool: Uint8ArrayConstructor; }>; export {}; -//# sourceMappingURL=tensor.d.ts.map \ No newline at end of file +//# sourceMappingURL=tensor.d.ts.map diff --git a/docs/docs/customize/changelog.md b/docs/docs/customize/changelog.md index ba501267a..64ddc0621 100644 --- a/docs/docs/customize/changelog.md +++ b/docs/docs/customize/changelog.md @@ -10,4 +10,4 @@ If you are not redirected automatically, follow this [link to the changelog](htt - \ No newline at end of file + diff --git a/extensions/vscode/e2e/tests/TODO.md b/extensions/vscode/e2e/tests/TODO.md index ce2be2ac0..3830d92de 100644 --- a/extensions/vscode/e2e/tests/TODO.md +++ b/extensions/vscode/e2e/tests/TODO.md @@ -1,3 +1,3 @@ TODO testing scenarios -- Highlighting code and pressing CMD+L _before_ the GUI was first opened. The highlighted code should be included as chat input. \ No newline at end of file +- Highlighting code and pressing CMD+L _before_ the GUI was first opened. 
The highlighted code should be included as chat input. diff --git a/extensions/vscode/models/all-MiniLM-L6-v2/README.md b/extensions/vscode/models/all-MiniLM-L6-v2/README.md index 9f0f484c4..2fe7dd1dd 100644 --- a/extensions/vscode/models/all-MiniLM-L6-v2/README.md +++ b/extensions/vscode/models/all-MiniLM-L6-v2/README.md @@ -7,6 +7,7 @@ https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 with ONNX weights ## Usage (Transformers.js) If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using: + ```bash npm i @xenova/transformers ``` @@ -14,14 +15,17 @@ npm i @xenova/transformers You can then use the model to compute embeddings like this: ```js -import { pipeline } from '@xenova/transformers'; +import { pipeline } from "@xenova/transformers"; // Create a feature-extraction pipeline -const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); +const extractor = await pipeline( + "feature-extraction", + "Xenova/all-MiniLM-L6-v2", +); // Compute sentence embeddings -const sentences = ['This is an example sentence', 'Each sentence is converted']; -const output = await extractor(sentences, { pooling: 'mean', normalize: true }); +const sentences = ["This is an example sentence", "Each sentence is converted"]; +const output = await extractor(sentences, { pooling: "mean", normalize: true }); console.log(output); // Tensor { // dims: [ 2, 384 ], @@ -32,6 +36,7 @@ console.log(output); ``` You can convert this Tensor to a nested JavaScript array using `.tolist()`: + ```js console.log(output.tolist()); // [ @@ -40,5 +45,4 @@ console.log(output.tolist()); // ] ``` - -Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. 
If you would like to make your models web-ready, we recommend converting to ONNX using [๐Ÿค— Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`). \ No newline at end of file +Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [๐Ÿค— Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`). diff --git a/extensions/vscode/models/all-MiniLM-L6-v2/tokenizer.json b/extensions/vscode/models/all-MiniLM-L6-v2/tokenizer.json index c17ed520e..619288fb6 100644 --- a/extensions/vscode/models/all-MiniLM-L6-v2/tokenizer.json +++ b/extensions/vscode/models/all-MiniLM-L6-v2/tokenizer.json @@ -130,21 +130,13 @@ "special_tokens": { "[CLS]": { "id": "[CLS]", - "ids": [ - 101 - ], - "tokens": [ - "[CLS]" - ] + "ids": [101], + "tokens": ["[CLS]"] }, "[SEP]": { "id": "[SEP]", - "ids": [ - 102 - ], - "tokens": [ - "[SEP]" - ] + "ids": [102], + "tokens": ["[SEP]"] } } }, @@ -30683,4 +30675,4 @@ "##๏ฝž": 30521 } } -} \ No newline at end of file +} diff --git a/extensions/vscode/src/extension/VsCodeMessenger.ts b/extensions/vscode/src/extension/VsCodeMessenger.ts index ed2c143f7..b0c0a4d7b 100644 --- a/extensions/vscode/src/extension/VsCodeMessenger.ts +++ b/extensions/vscode/src/extension/VsCodeMessenger.ts @@ -17,7 +17,6 @@ import { import { stripImages } from "core/util/messageContent"; import * as vscode from "vscode"; - import { ApplyManager } from "../apply"; import { VerticalDiffManager } from "../diff/vertical/manager"; import { addCurrentSelectionToEdit } from "../quickEdit/AddCurrentSelection"; diff --git a/extensions/vscode/src/quickEdit/EditDecorationManager.ts b/extensions/vscode/src/quickEdit/EditDecorationManager.ts index 978e7824f..5099277bc 
100644 --- a/extensions/vscode/src/quickEdit/EditDecorationManager.ts +++ b/extensions/vscode/src/quickEdit/EditDecorationManager.ts @@ -45,12 +45,16 @@ class EditDecorationManager { const rangesToPrune: string[] = []; for (const [key, existingRange] of this.activeRangesMap.entries()) { - if (!this.rangesCoincide(mergedRange, existingRange)) {continue;} + if (!this.rangesCoincide(mergedRange, existingRange)) { + continue; + } mergedRange = mergedRange.union(existingRange); rangesToPrune.push(key); } - for (const key of rangesToPrune) {this.activeRangesMap.delete(key);} + for (const key of rangesToPrune) { + this.activeRangesMap.delete(key); + } this.activeRangesMap.set(this.rangeToString(mergedRange), mergedRange); } @@ -61,10 +65,14 @@ class EditDecorationManager { } this._lastEditor = editor; - for (const range of ranges) {this.mergeNewRange(range);} + for (const range of ranges) { + this.mergeNewRange(range); + } const activeRanges = Array.from(this.activeRangesMap.values()); - if (activeRanges.length === 0) {return;} // No ranges to highlight + if (activeRanges.length === 0) { + return; + } // No ranges to highlight // Update active ranges and apply decorations editor.setDecorations(this.decorationType, activeRanges); diff --git a/gui/src/components/StyledMarkdownPreview/utils/remarkTables.tsx b/gui/src/components/StyledMarkdownPreview/utils/remarkTables.tsx index 5958d28fb..db26e4b3f 100644 --- a/gui/src/components/StyledMarkdownPreview/utils/remarkTables.tsx +++ b/gui/src/components/StyledMarkdownPreview/utils/remarkTables.tsx @@ -33,7 +33,7 @@ export function remarkTables() { //// header // newline // |:---|----:| // new line // table rows // prevent modifying if no markdown tables are present - if(!buffer.match(tableRegex)) { + if (!buffer.match(tableRegex)) { return; } diff --git a/gui/src/components/mainInput/belowMainInput/RulesPeek.tsx b/gui/src/components/mainInput/belowMainInput/RulesPeek.tsx index a21f5ea1f..12bd90811 100644 --- 
a/gui/src/components/mainInput/belowMainInput/RulesPeek.tsx +++ b/gui/src/components/mainInput/belowMainInput/RulesPeek.tsx @@ -35,7 +35,7 @@ const getSourceLabel = (source: RuleSource): string => { }; export function RulesPeekItem({ rule }: RulesPeekItemProps) { - const isGlobal = rule.alwaysApply ?? (!rule.globs); + const isGlobal = rule.alwaysApply ?? !rule.globs; const [expanded, setExpanded] = useState(false); // Define maximum length for rule text display diff --git a/packages/continue-sdk/python/api/README.md b/packages/continue-sdk/python/api/README.md index 75f45e7bb..67200c21e 100644 --- a/packages/continue-sdk/python/api/README.md +++ b/packages/continue-sdk/python/api/README.md @@ -1,21 +1,22 @@ # openapi-client + API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. - This Python package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project: - API version: 1.0.0 - Package version: 1.0.0 - Generator version: 7.12.0 - Build package: org.openapitools.codegen.languages.PythonClientCodegen -For more information, please visit [https://continue.dev](https://continue.dev) + For more information, please visit [https://continue.dev](https://continue.dev) ## Requirements. Python 3.8+ ## Installation & Usage + ### pip install If the python package is hosted on a repository, you can install directly using: @@ -23,9 +24,11 @@ If the python package is hosted on a repository, you can install directly using: ```sh pip install git+https://github.com/GIT_USER_ID/GIT_REPO_ID.git ``` + (you may need to run `pip` with root permission: `sudo pip install git+https://github.com/GIT_USER_ID/GIT_REPO_ID.git`) Then import the package: + ```python import openapi_client ``` @@ -37,9 +40,11 @@ Install via [Setuptools](http://pypi.python.org/pypi/setuptools). 
```sh python setup.py install --user ``` + (or `sudo python setup.py install` to install the package for all users) Then import the package: + ```python import openapi_client ``` @@ -96,32 +101,26 @@ with openapi_client.ApiClient(configuration) as api_client: All URIs are relative to *https://api.continue.dev* -Class | Method | HTTP request | Description ------------- | ------------- | ------------- | ------------- -*DefaultApi* | [**list_assistants**](docs/DefaultApi.md#list_assistants) | **GET** /ide/list-assistants | List assistants for IDE - +| Class | Method | HTTP request | Description | +| ------------ | --------------------------------------------------------- | ---------------------------- | ----------------------- | +| _DefaultApi_ | [**list_assistants**](docs/DefaultApi.md#list_assistants) | **GET** /ide/list-assistants | List assistants for IDE | ## Documentation For Models - - [ListAssistants200ResponseInner](docs/ListAssistants200ResponseInner.md) - - [ListAssistants200ResponseInnerConfigResult](docs/ListAssistants200ResponseInnerConfigResult.md) - - [ListAssistants401Response](docs/ListAssistants401Response.md) - - [ListAssistants404Response](docs/ListAssistants404Response.md) - +- [ListAssistants200ResponseInner](docs/ListAssistants200ResponseInner.md) +- [ListAssistants200ResponseInnerConfigResult](docs/ListAssistants200ResponseInnerConfigResult.md) +- [ListAssistants401Response](docs/ListAssistants401Response.md) +- [ListAssistants404Response](docs/ListAssistants404Response.md) -## Documentation For Authorization +## Documentation For Authorization Authentication schemes defined for the API: + ### apiKeyAuth - **Type**: Bearer authentication - ## Author - - - - diff --git a/packages/continue-sdk/python/api/docs/DefaultApi.md b/packages/continue-sdk/python/api/docs/DefaultApi.md index 9ddf8fb99..b5632904f 100644 --- a/packages/continue-sdk/python/api/docs/DefaultApi.md +++ b/packages/continue-sdk/python/api/docs/DefaultApi.md @@ -2,12 +2,12 @@ All 
URIs are relative to *https://api.continue.dev* -Method | HTTP request | Description -------------- | ------------- | ------------- -[**list_assistants**](DefaultApi.md#list_assistants) | **GET** /ide/list-assistants | List assistants for IDE - +| Method | HTTP request | Description | +| ---------------------------------------------------- | ---------------------------- | ----------------------- | +| [**list_assistants**](DefaultApi.md#list_assistants) | **GET** /ide/list-assistants | List assistants for IDE | # **list_assistants** + > List[ListAssistants200ResponseInner] list_assistants(always_use_proxy=always_use_proxy, organization_id=organization_id) List assistants for IDE @@ -18,10 +18,9 @@ icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. - ### Example -* Bearer Authentication (apiKeyAuth): +- Bearer Authentication (apiKeyAuth): ```python import openapi_client @@ -61,15 +60,12 @@ with openapi_client.ApiClient(configuration) as api_client: print("Exception when calling DefaultApi->list_assistants: %s\n" % e) ``` - - ### Parameters - -Name | Type | Description | Notes -------------- | ------------- | ------------- | ------------- - **always_use_proxy** | **str**| Whether to always use the Continue-managed proxy for model requests | [optional] - **organization_id** | **str**| ID of the organization to scope assistants to. If not provided, personal assistants are returned. | [optional] +| Name | Type | Description | Notes | +| -------------------- | ------- | ------------------------------------------------------------------------------------------------- | ---------- | +| **always_use_proxy** | **str** | Whether to always use the Continue-managed proxy for model requests | [optional] | +| **organization_id** | **str** | ID of the organization to scope assistants to. If not provided, personal assistants are returned. 
| [optional] | ### Return type @@ -81,16 +77,15 @@ Name | Type | Description | Notes ### HTTP request headers - - **Content-Type**: Not defined - - **Accept**: application/json +- **Content-Type**: Not defined +- **Accept**: application/json ### HTTP response details -| Status code | Description | Response headers | -|-------------|-------------|------------------| -**200** | Successfully retrieved assistants | - | -**401** | Unauthorized - Authentication failed | - | -**404** | User not found | - | +| Status code | Description | Response headers | +| ----------- | ------------------------------------ | ---------------- | +| **200** | Successfully retrieved assistants | - | +| **401** | Unauthorized - Authentication failed | - | +| **404** | User not found | - | [[Back to top]](#) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to Model list]](../README.md#documentation-for-models) [[Back to README]](../README.md) - diff --git a/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInner.md b/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInner.md index f642b43c2..1fe63505b 100644 --- a/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInner.md +++ b/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInner.md @@ -1,17 +1,16 @@ # ListAssistants200ResponseInner - ## Properties -Name | Type | Description | Notes ------------- | ------------- | ------------- | ------------- -**config_result** | [**ListAssistants200ResponseInnerConfigResult**](ListAssistants200ResponseInnerConfigResult.md) | | -**owner_slug** | **str** | Slug of the user or organization that owns the assistant | -**package_slug** | **str** | Slug of the assistant package | -**icon_url** | **str** | Pre-signed URL for the assistant's icon | [optional] -**on_prem_proxy_url** | **str** | URL of the on-premises proxy if the organization uses one | [optional] -**use_on_prem_proxy** | **bool** | Whether the organization uses an 
on-premises proxy | [optional] -**raw_yaml** | **str** | Raw YAML configuration of the assistant | [optional] +| Name | Type | Description | Notes | +| --------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------- | ---------- | +| **config_result** | [**ListAssistants200ResponseInnerConfigResult**](ListAssistants200ResponseInnerConfigResult.md) | | +| **owner_slug** | **str** | Slug of the user or organization that owns the assistant | +| **package_slug** | **str** | Slug of the assistant package | +| **icon_url** | **str** | Pre-signed URL for the assistant's icon | [optional] | +| **on_prem_proxy_url** | **str** | URL of the on-premises proxy if the organization uses one | [optional] | +| **use_on_prem_proxy** | **bool** | Whether the organization uses an on-premises proxy | [optional] | +| **raw_yaml** | **str** | Raw YAML configuration of the assistant | [optional] | ## Example @@ -30,6 +29,5 @@ list_assistants200_response_inner_dict = list_assistants200_response_inner_insta # create an instance of ListAssistants200ResponseInner from a dict list_assistants200_response_inner_from_dict = ListAssistants200ResponseInner.from_dict(list_assistants200_response_inner_dict) ``` + [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) - - diff --git a/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInnerConfigResult.md b/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInnerConfigResult.md index bcee818dd..e12781d04 100644 --- a/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInnerConfigResult.md +++ b/packages/continue-sdk/python/api/docs/ListAssistants200ResponseInnerConfigResult.md @@ -1,13 +1,12 @@ # ListAssistants200ResponseInnerConfigResult - ## Properties -Name | Type | Description | Notes 
------------- | ------------- | ------------- | ------------- -**config** | **object** | The unrolled assistant configuration | -**config_load_interrupted** | **bool** | Whether the configuration loading was interrupted | -**errors** | **List[str]** | Any errors that occurred during configuration loading | [optional] +| Name | Type | Description | Notes | +| --------------------------- | ------------- | ----------------------------------------------------- | ---------- | +| **config** | **object** | The unrolled assistant configuration | +| **config_load_interrupted** | **bool** | Whether the configuration loading was interrupted | +| **errors** | **List[str]** | Any errors that occurred during configuration loading | [optional] | ## Example @@ -26,6 +25,5 @@ list_assistants200_response_inner_config_result_dict = list_assistants200_respon # create an instance of ListAssistants200ResponseInnerConfigResult from a dict list_assistants200_response_inner_config_result_from_dict = ListAssistants200ResponseInnerConfigResult.from_dict(list_assistants200_response_inner_config_result_dict) ``` + [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) - - diff --git a/packages/continue-sdk/python/api/docs/ListAssistants401Response.md b/packages/continue-sdk/python/api/docs/ListAssistants401Response.md index f10de1a66..eb64e678d 100644 --- a/packages/continue-sdk/python/api/docs/ListAssistants401Response.md +++ b/packages/continue-sdk/python/api/docs/ListAssistants401Response.md @@ -1,11 +1,10 @@ # ListAssistants401Response - ## Properties -Name | Type | Description | Notes ------------- | ------------- | ------------- | ------------- -**message** | **str** | | [optional] +| Name | Type | Description | Notes | +| ----------- | ------- | ----------- | ---------- | +| **message** | **str** | | [optional] | ## Example @@ -24,6 +23,5 @@ list_assistants401_response_dict = 
list_assistants401_response_instance.to_dict( # create an instance of ListAssistants401Response from a dict list_assistants401_response_from_dict = ListAssistants401Response.from_dict(list_assistants401_response_dict) ``` + [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) - - diff --git a/packages/continue-sdk/python/api/docs/ListAssistants404Response.md b/packages/continue-sdk/python/api/docs/ListAssistants404Response.md index 5752771ec..a20b0f7b4 100644 --- a/packages/continue-sdk/python/api/docs/ListAssistants404Response.md +++ b/packages/continue-sdk/python/api/docs/ListAssistants404Response.md @@ -1,11 +1,10 @@ # ListAssistants404Response - ## Properties -Name | Type | Description | Notes ------------- | ------------- | ------------- | ------------- -**message** | **str** | | [optional] +| Name | Type | Description | Notes | +| ----------- | ------- | ----------- | ---------- | +| **message** | **str** | | [optional] | ## Example @@ -24,6 +23,5 @@ list_assistants404_response_dict = list_assistants404_response_instance.to_dict( # create an instance of ListAssistants404Response from a dict list_assistants404_response_from_dict = ListAssistants404Response.from_dict(list_assistants404_response_dict) ``` + [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) - - diff --git a/packages/continue-sdk/typescript/api/README.md b/packages/continue-sdk/typescript/api/README.md index ed367262b..fd744e00b 100644 --- a/packages/continue-sdk/typescript/api/README.md +++ b/packages/continue-sdk/typescript/api/README.md @@ -3,23 +3,27 @@ This generator creates TypeScript/JavaScript client that utilizes [Fetch API](https://fetch.spec.whatwg.org/). 
The generated Node module can be used in the following environments: Environment -* Node.js -* Webpack -* Browserify + +- Node.js +- Webpack +- Browserify Language level -* ES5 - you must have a Promises/A+ library installed -* ES6 + +- ES5 - you must have a Promises/A+ library installed +- ES6 Module system -* CommonJS -* ES6 module system + +- CommonJS +- ES6 module system It can be used in both TypeScript and JavaScript. In TypeScript, the definition will be automatically resolved via `package.json`. ([Reference](https://www.typescriptlang.org/docs/handbook/declaration-files/consumption.html)) ### Building To build and compile the typescript sources to javascript use: + ``` npm install npm run build diff --git a/packages/continue-sdk/typescript/api/src/apis/DefaultApi.ts b/packages/continue-sdk/typescript/api/src/apis/DefaultApi.ts index b5d93c0ed..6bf466f23 100644 --- a/packages/continue-sdk/typescript/api/src/apis/DefaultApi.ts +++ b/packages/continue-sdk/typescript/api/src/apis/DefaultApi.ts @@ -2,119 +2,136 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. * * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. 
*/ - -import * as runtime from '../runtime'; +import * as runtime from "../runtime"; import type { ListAssistants200ResponseInner, ListAssistants401Response, ListAssistants404Response, -} from '../models/index'; +} from "../models/index"; import { - ListAssistants200ResponseInnerFromJSON, - ListAssistants200ResponseInnerToJSON, - ListAssistants401ResponseFromJSON, - ListAssistants401ResponseToJSON, - ListAssistants404ResponseFromJSON, - ListAssistants404ResponseToJSON, -} from '../models/index'; + ListAssistants200ResponseInnerFromJSON, + ListAssistants200ResponseInnerToJSON, + ListAssistants401ResponseFromJSON, + ListAssistants401ResponseToJSON, + ListAssistants404ResponseFromJSON, + ListAssistants404ResponseToJSON, +} from "../models/index"; export interface ListAssistantsRequest { - alwaysUseProxy?: ListAssistantsAlwaysUseProxyEnum; - organizationId?: string; + alwaysUseProxy?: ListAssistantsAlwaysUseProxyEnum; + organizationId?: string; } /** * DefaultApi - interface - * + * * @export * @interface DefaultApiInterface */ export interface DefaultApiInterface { - /** - * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. - * @summary List assistants for IDE - * @param {'true' | 'false'} [alwaysUseProxy] Whether to always use the Continue-managed proxy for model requests - * @param {string} [organizationId] ID of the organization to scope assistants to. If not provided, personal assistants are returned. - * @param {*} [options] Override http request option. 
- * @throws {RequiredError} - * @memberof DefaultApiInterface - */ - listAssistantsRaw(requestParameters: ListAssistantsRequest, initOverrides?: RequestInit | runtime.InitOverrideFunction): Promise>>; - - /** - * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. - * List assistants for IDE - */ - listAssistants(requestParameters: ListAssistantsRequest, initOverrides?: RequestInit | runtime.InitOverrideFunction): Promise>; + /** + * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. + * @summary List assistants for IDE + * @param {'true' | 'false'} [alwaysUseProxy] Whether to always use the Continue-managed proxy for model requests + * @param {string} [organizationId] ID of the organization to scope assistants to. If not provided, personal assistants are returned. + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApiInterface + */ + listAssistantsRaw( + requestParameters: ListAssistantsRequest, + initOverrides?: RequestInit | runtime.InitOverrideFunction, + ): Promise>>; + /** + * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. 
+ * List assistants for IDE + */ + listAssistants( + requestParameters: ListAssistantsRequest, + initOverrides?: RequestInit | runtime.InitOverrideFunction, + ): Promise>; } /** - * + * */ export class DefaultApi extends runtime.BaseAPI implements DefaultApiInterface { + /** + * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. + * List assistants for IDE + */ + async listAssistantsRaw( + requestParameters: ListAssistantsRequest, + initOverrides?: RequestInit | runtime.InitOverrideFunction, + ): Promise>> { + const queryParameters: any = {}; - /** - * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. 
- * List assistants for IDE - */ - async listAssistantsRaw(requestParameters: ListAssistantsRequest, initOverrides?: RequestInit | runtime.InitOverrideFunction): Promise>> { - const queryParameters: any = {}; - - if (requestParameters['alwaysUseProxy'] != null) { - queryParameters['alwaysUseProxy'] = requestParameters['alwaysUseProxy']; - } - - if (requestParameters['organizationId'] != null) { - queryParameters['organizationId'] = requestParameters['organizationId']; - } - - const headerParameters: runtime.HTTPHeaders = {}; - - if (this.configuration && this.configuration.accessToken) { - const token = this.configuration.accessToken; - const tokenString = await token("apiKeyAuth", []); - - if (tokenString) { - headerParameters["Authorization"] = `Bearer ${tokenString}`; - } - } - const response = await this.request({ - path: `/ide/list-assistants`, - method: 'GET', - headers: headerParameters, - query: queryParameters, - }, initOverrides); - - return new runtime.JSONApiResponse(response, (jsonValue) => jsonValue.map(ListAssistants200ResponseInnerFromJSON)); + if (requestParameters["alwaysUseProxy"] != null) { + queryParameters["alwaysUseProxy"] = requestParameters["alwaysUseProxy"]; } - /** - * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. 
- * List assistants for IDE - */ - async listAssistants(requestParameters: ListAssistantsRequest = {}, initOverrides?: RequestInit | runtime.InitOverrideFunction): Promise> { - const response = await this.listAssistantsRaw(requestParameters, initOverrides); - return await response.value(); + if (requestParameters["organizationId"] != null) { + queryParameters["organizationId"] = requestParameters["organizationId"]; } + const headerParameters: runtime.HTTPHeaders = {}; + + if (this.configuration && this.configuration.accessToken) { + const token = this.configuration.accessToken; + const tokenString = await token("apiKeyAuth", []); + + if (tokenString) { + headerParameters["Authorization"] = `Bearer ${tokenString}`; + } + } + const response = await this.request( + { + path: `/ide/list-assistants`, + method: "GET", + headers: headerParameters, + query: queryParameters, + }, + initOverrides, + ); + + return new runtime.JSONApiResponse(response, (jsonValue) => + jsonValue.map(ListAssistants200ResponseInnerFromJSON), + ); + } + + /** + * Returns a complete list of assistants available to the user, with their full configurations, icons, and other metadata needed by the IDE to display and use them. This endpoint performs a full refresh of the list of assistants, including unrolling configurations and resolving secrets. 
+ * List assistants for IDE + */ + async listAssistants( + requestParameters: ListAssistantsRequest = {}, + initOverrides?: RequestInit | runtime.InitOverrideFunction, + ): Promise> { + const response = await this.listAssistantsRaw( + requestParameters, + initOverrides, + ); + return await response.value(); + } } /** * @export */ export const ListAssistantsAlwaysUseProxyEnum = { - TRUE: 'true', - FALSE: 'false' + TRUE: "true", + FALSE: "false", } as const; -export type ListAssistantsAlwaysUseProxyEnum = typeof ListAssistantsAlwaysUseProxyEnum[keyof typeof ListAssistantsAlwaysUseProxyEnum]; +export type ListAssistantsAlwaysUseProxyEnum = + (typeof ListAssistantsAlwaysUseProxyEnum)[keyof typeof ListAssistantsAlwaysUseProxyEnum]; diff --git a/packages/continue-sdk/typescript/api/src/apis/index.ts b/packages/continue-sdk/typescript/api/src/apis/index.ts index 69c44c00f..f70f3d328 100644 --- a/packages/continue-sdk/typescript/api/src/apis/index.ts +++ b/packages/continue-sdk/typescript/api/src/apis/index.ts @@ -1,3 +1,3 @@ /* tslint:disable */ /* eslint-disable */ -export * from './DefaultApi'; +export * from "./DefaultApi"; diff --git a/packages/continue-sdk/typescript/api/src/index.ts b/packages/continue-sdk/typescript/api/src/index.ts index bebe8bbbe..ee2637724 100644 --- a/packages/continue-sdk/typescript/api/src/index.ts +++ b/packages/continue-sdk/typescript/api/src/index.ts @@ -1,5 +1,5 @@ /* tslint:disable */ /* eslint-disable */ -export * from './runtime'; -export * from './apis/index'; -export * from './models/index'; +export * from "./runtime"; +export * from "./apis/index"; +export * from "./models/index"; diff --git a/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInner.ts b/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInner.ts index 9f572de3e..015aeb3ba 100644 --- a/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInner.ts +++ 
b/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInner.ts @@ -2,123 +2,140 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. * * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. */ -import { mapValues } from '../runtime'; -import type { ListAssistants200ResponseInnerConfigResult } from './ListAssistants200ResponseInnerConfigResult'; +import { mapValues } from "../runtime"; +import type { ListAssistants200ResponseInnerConfigResult } from "./ListAssistants200ResponseInnerConfigResult"; import { - ListAssistants200ResponseInnerConfigResultFromJSON, - ListAssistants200ResponseInnerConfigResultFromJSONTyped, - ListAssistants200ResponseInnerConfigResultToJSON, - ListAssistants200ResponseInnerConfigResultToJSONTyped, -} from './ListAssistants200ResponseInnerConfigResult'; + ListAssistants200ResponseInnerConfigResultFromJSON, + ListAssistants200ResponseInnerConfigResultFromJSONTyped, + ListAssistants200ResponseInnerConfigResultToJSON, + ListAssistants200ResponseInnerConfigResultToJSONTyped, +} from "./ListAssistants200ResponseInnerConfigResult"; /** - * + * * @export * @interface ListAssistants200ResponseInner */ export interface ListAssistants200ResponseInner { - /** - * - * @type {ListAssistants200ResponseInnerConfigResult} - * @memberof ListAssistants200ResponseInner - */ - configResult: ListAssistants200ResponseInnerConfigResult; - /** - * Slug of the user or organization that owns the assistant - * @type {string} - * @memberof 
ListAssistants200ResponseInner - */ - ownerSlug: string; - /** - * Slug of the assistant package - * @type {string} - * @memberof ListAssistants200ResponseInner - */ - packageSlug: string; - /** - * Pre-signed URL for the assistant's icon - * @type {string} - * @memberof ListAssistants200ResponseInner - */ - iconUrl?: string | null; - /** - * URL of the on-premises proxy if the organization uses one - * @type {string} - * @memberof ListAssistants200ResponseInner - */ - onPremProxyUrl?: string | null; - /** - * Whether the organization uses an on-premises proxy - * @type {boolean} - * @memberof ListAssistants200ResponseInner - */ - useOnPremProxy?: boolean | null; - /** - * Raw YAML configuration of the assistant - * @type {string} - * @memberof ListAssistants200ResponseInner - */ - rawYaml?: string; + /** + * + * @type {ListAssistants200ResponseInnerConfigResult} + * @memberof ListAssistants200ResponseInner + */ + configResult: ListAssistants200ResponseInnerConfigResult; + /** + * Slug of the user or organization that owns the assistant + * @type {string} + * @memberof ListAssistants200ResponseInner + */ + ownerSlug: string; + /** + * Slug of the assistant package + * @type {string} + * @memberof ListAssistants200ResponseInner + */ + packageSlug: string; + /** + * Pre-signed URL for the assistant's icon + * @type {string} + * @memberof ListAssistants200ResponseInner + */ + iconUrl?: string | null; + /** + * URL of the on-premises proxy if the organization uses one + * @type {string} + * @memberof ListAssistants200ResponseInner + */ + onPremProxyUrl?: string | null; + /** + * Whether the organization uses an on-premises proxy + * @type {boolean} + * @memberof ListAssistants200ResponseInner + */ + useOnPremProxy?: boolean | null; + /** + * Raw YAML configuration of the assistant + * @type {string} + * @memberof ListAssistants200ResponseInner + */ + rawYaml?: string; } /** * Check if a given object implements the ListAssistants200ResponseInner interface. 
*/ -export function instanceOfListAssistants200ResponseInner(value: object): value is ListAssistants200ResponseInner { - if (!('configResult' in value) || value['configResult'] === undefined) return false; - if (!('ownerSlug' in value) || value['ownerSlug'] === undefined) return false; - if (!('packageSlug' in value) || value['packageSlug'] === undefined) return false; - return true; +export function instanceOfListAssistants200ResponseInner( + value: object, +): value is ListAssistants200ResponseInner { + if (!("configResult" in value) || value["configResult"] === undefined) + return false; + if (!("ownerSlug" in value) || value["ownerSlug"] === undefined) return false; + if (!("packageSlug" in value) || value["packageSlug"] === undefined) + return false; + return true; } -export function ListAssistants200ResponseInnerFromJSON(json: any): ListAssistants200ResponseInner { - return ListAssistants200ResponseInnerFromJSONTyped(json, false); +export function ListAssistants200ResponseInnerFromJSON( + json: any, +): ListAssistants200ResponseInner { + return ListAssistants200ResponseInnerFromJSONTyped(json, false); } -export function ListAssistants200ResponseInnerFromJSONTyped(json: any, ignoreDiscriminator: boolean): ListAssistants200ResponseInner { - if (json == null) { - return json; - } - return { - - 'configResult': ListAssistants200ResponseInnerConfigResultFromJSON(json['configResult']), - 'ownerSlug': json['ownerSlug'], - 'packageSlug': json['packageSlug'], - 'iconUrl': json['iconUrl'] == null ? undefined : json['iconUrl'], - 'onPremProxyUrl': json['onPremProxyUrl'] == null ? undefined : json['onPremProxyUrl'], - 'useOnPremProxy': json['useOnPremProxy'] == null ? undefined : json['useOnPremProxy'], - 'rawYaml': json['rawYaml'] == null ? 
undefined : json['rawYaml'], - }; +export function ListAssistants200ResponseInnerFromJSONTyped( + json: any, + ignoreDiscriminator: boolean, +): ListAssistants200ResponseInner { + if (json == null) { + return json; + } + return { + configResult: ListAssistants200ResponseInnerConfigResultFromJSON( + json["configResult"], + ), + ownerSlug: json["ownerSlug"], + packageSlug: json["packageSlug"], + iconUrl: json["iconUrl"] == null ? undefined : json["iconUrl"], + onPremProxyUrl: + json["onPremProxyUrl"] == null ? undefined : json["onPremProxyUrl"], + useOnPremProxy: + json["useOnPremProxy"] == null ? undefined : json["useOnPremProxy"], + rawYaml: json["rawYaml"] == null ? undefined : json["rawYaml"], + }; } -export function ListAssistants200ResponseInnerToJSON(json: any): ListAssistants200ResponseInner { - return ListAssistants200ResponseInnerToJSONTyped(json, false); +export function ListAssistants200ResponseInnerToJSON( + json: any, +): ListAssistants200ResponseInner { + return ListAssistants200ResponseInnerToJSONTyped(json, false); } -export function ListAssistants200ResponseInnerToJSONTyped(value?: ListAssistants200ResponseInner | null, ignoreDiscriminator: boolean = false): any { - if (value == null) { - return value; - } +export function ListAssistants200ResponseInnerToJSONTyped( + value?: ListAssistants200ResponseInner | null, + ignoreDiscriminator: boolean = false, +): any { + if (value == null) { + return value; + } - return { - - 'configResult': ListAssistants200ResponseInnerConfigResultToJSON(value['configResult']), - 'ownerSlug': value['ownerSlug'], - 'packageSlug': value['packageSlug'], - 'iconUrl': value['iconUrl'], - 'onPremProxyUrl': value['onPremProxyUrl'], - 'useOnPremProxy': value['useOnPremProxy'], - 'rawYaml': value['rawYaml'], - }; + return { + configResult: ListAssistants200ResponseInnerConfigResultToJSON( + value["configResult"], + ), + ownerSlug: value["ownerSlug"], + packageSlug: value["packageSlug"], + iconUrl: value["iconUrl"], + 
onPremProxyUrl: value["onPremProxyUrl"], + useOnPremProxy: value["useOnPremProxy"], + rawYaml: value["rawYaml"], + }; } - diff --git a/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInnerConfigResult.ts b/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInnerConfigResult.ts index fd2e6ea8c..a3a9e9522 100644 --- a/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInnerConfigResult.ts +++ b/packages/continue-sdk/typescript/api/src/models/ListAssistants200ResponseInnerConfigResult.ts @@ -2,82 +2,95 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. * * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. 
*/ -import { mapValues } from '../runtime'; +import { mapValues } from "../runtime"; /** - * + * * @export * @interface ListAssistants200ResponseInnerConfigResult */ export interface ListAssistants200ResponseInnerConfigResult { - /** - * The unrolled assistant configuration - * @type {object} - * @memberof ListAssistants200ResponseInnerConfigResult - */ - config: object | null; - /** - * Whether the configuration loading was interrupted - * @type {boolean} - * @memberof ListAssistants200ResponseInnerConfigResult - */ - configLoadInterrupted: boolean; - /** - * Any errors that occurred during configuration loading - * @type {Array} - * @memberof ListAssistants200ResponseInnerConfigResult - */ - errors?: Array | null; + /** + * The unrolled assistant configuration + * @type {object} + * @memberof ListAssistants200ResponseInnerConfigResult + */ + config: object | null; + /** + * Whether the configuration loading was interrupted + * @type {boolean} + * @memberof ListAssistants200ResponseInnerConfigResult + */ + configLoadInterrupted: boolean; + /** + * Any errors that occurred during configuration loading + * @type {Array} + * @memberof ListAssistants200ResponseInnerConfigResult + */ + errors?: Array | null; } /** * Check if a given object implements the ListAssistants200ResponseInnerConfigResult interface. 
*/ -export function instanceOfListAssistants200ResponseInnerConfigResult(value: object): value is ListAssistants200ResponseInnerConfigResult { - if (!('config' in value) || value['config'] === undefined) return false; - if (!('configLoadInterrupted' in value) || value['configLoadInterrupted'] === undefined) return false; - return true; +export function instanceOfListAssistants200ResponseInnerConfigResult( + value: object, +): value is ListAssistants200ResponseInnerConfigResult { + if (!("config" in value) || value["config"] === undefined) return false; + if ( + !("configLoadInterrupted" in value) || + value["configLoadInterrupted"] === undefined + ) + return false; + return true; } -export function ListAssistants200ResponseInnerConfigResultFromJSON(json: any): ListAssistants200ResponseInnerConfigResult { - return ListAssistants200ResponseInnerConfigResultFromJSONTyped(json, false); +export function ListAssistants200ResponseInnerConfigResultFromJSON( + json: any, +): ListAssistants200ResponseInnerConfigResult { + return ListAssistants200ResponseInnerConfigResultFromJSONTyped(json, false); } -export function ListAssistants200ResponseInnerConfigResultFromJSONTyped(json: any, ignoreDiscriminator: boolean): ListAssistants200ResponseInnerConfigResult { - if (json == null) { - return json; - } - return { - - 'config': json['config'], - 'configLoadInterrupted': json['configLoadInterrupted'], - 'errors': json['errors'] == null ? undefined : json['errors'], - }; +export function ListAssistants200ResponseInnerConfigResultFromJSONTyped( + json: any, + ignoreDiscriminator: boolean, +): ListAssistants200ResponseInnerConfigResult { + if (json == null) { + return json; + } + return { + config: json["config"], + configLoadInterrupted: json["configLoadInterrupted"], + errors: json["errors"] == null ? 
undefined : json["errors"], + }; } -export function ListAssistants200ResponseInnerConfigResultToJSON(json: any): ListAssistants200ResponseInnerConfigResult { - return ListAssistants200ResponseInnerConfigResultToJSONTyped(json, false); +export function ListAssistants200ResponseInnerConfigResultToJSON( + json: any, +): ListAssistants200ResponseInnerConfigResult { + return ListAssistants200ResponseInnerConfigResultToJSONTyped(json, false); } -export function ListAssistants200ResponseInnerConfigResultToJSONTyped(value?: ListAssistants200ResponseInnerConfigResult | null, ignoreDiscriminator: boolean = false): any { - if (value == null) { - return value; - } +export function ListAssistants200ResponseInnerConfigResultToJSONTyped( + value?: ListAssistants200ResponseInnerConfigResult | null, + ignoreDiscriminator: boolean = false, +): any { + if (value == null) { + return value; + } - return { - - 'config': value['config'], - 'configLoadInterrupted': value['configLoadInterrupted'], - 'errors': value['errors'], - }; + return { + config: value["config"], + configLoadInterrupted: value["configLoadInterrupted"], + errors: value["errors"], + }; } - diff --git a/packages/continue-sdk/typescript/api/src/models/ListAssistants401Response.ts b/packages/continue-sdk/typescript/api/src/models/ListAssistants401Response.ts index a8d7d10b1..9c4a961a5 100644 --- a/packages/continue-sdk/typescript/api/src/models/ListAssistants401Response.ts +++ b/packages/continue-sdk/typescript/api/src/models/ListAssistants401Response.ts @@ -2,64 +2,73 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. 
* * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. */ -import { mapValues } from '../runtime'; +import { mapValues } from "../runtime"; /** - * + * * @export * @interface ListAssistants401Response */ export interface ListAssistants401Response { - /** - * - * @type {string} - * @memberof ListAssistants401Response - */ - message?: string; + /** + * + * @type {string} + * @memberof ListAssistants401Response + */ + message?: string; } /** * Check if a given object implements the ListAssistants401Response interface. */ -export function instanceOfListAssistants401Response(value: object): value is ListAssistants401Response { - return true; +export function instanceOfListAssistants401Response( + value: object, +): value is ListAssistants401Response { + return true; } -export function ListAssistants401ResponseFromJSON(json: any): ListAssistants401Response { - return ListAssistants401ResponseFromJSONTyped(json, false); +export function ListAssistants401ResponseFromJSON( + json: any, +): ListAssistants401Response { + return ListAssistants401ResponseFromJSONTyped(json, false); } -export function ListAssistants401ResponseFromJSONTyped(json: any, ignoreDiscriminator: boolean): ListAssistants401Response { - if (json == null) { - return json; - } - return { - - 'message': json['message'] == null ? undefined : json['message'], - }; +export function ListAssistants401ResponseFromJSONTyped( + json: any, + ignoreDiscriminator: boolean, +): ListAssistants401Response { + if (json == null) { + return json; + } + return { + message: json["message"] == null ? 
undefined : json["message"], + }; } -export function ListAssistants401ResponseToJSON(json: any): ListAssistants401Response { - return ListAssistants401ResponseToJSONTyped(json, false); +export function ListAssistants401ResponseToJSON( + json: any, +): ListAssistants401Response { + return ListAssistants401ResponseToJSONTyped(json, false); } -export function ListAssistants401ResponseToJSONTyped(value?: ListAssistants401Response | null, ignoreDiscriminator: boolean = false): any { - if (value == null) { - return value; - } +export function ListAssistants401ResponseToJSONTyped( + value?: ListAssistants401Response | null, + ignoreDiscriminator: boolean = false, +): any { + if (value == null) { + return value; + } - return { - - 'message': value['message'], - }; + return { + message: value["message"], + }; } - diff --git a/packages/continue-sdk/typescript/api/src/models/ListAssistants404Response.ts b/packages/continue-sdk/typescript/api/src/models/ListAssistants404Response.ts index b90a21824..8483c508b 100644 --- a/packages/continue-sdk/typescript/api/src/models/ListAssistants404Response.ts +++ b/packages/continue-sdk/typescript/api/src/models/ListAssistants404Response.ts @@ -2,64 +2,73 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. * * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. 
*/ -import { mapValues } from '../runtime'; +import { mapValues } from "../runtime"; /** - * + * * @export * @interface ListAssistants404Response */ export interface ListAssistants404Response { - /** - * - * @type {string} - * @memberof ListAssistants404Response - */ - message?: string; + /** + * + * @type {string} + * @memberof ListAssistants404Response + */ + message?: string; } /** * Check if a given object implements the ListAssistants404Response interface. */ -export function instanceOfListAssistants404Response(value: object): value is ListAssistants404Response { - return true; +export function instanceOfListAssistants404Response( + value: object, +): value is ListAssistants404Response { + return true; } -export function ListAssistants404ResponseFromJSON(json: any): ListAssistants404Response { - return ListAssistants404ResponseFromJSONTyped(json, false); +export function ListAssistants404ResponseFromJSON( + json: any, +): ListAssistants404Response { + return ListAssistants404ResponseFromJSONTyped(json, false); } -export function ListAssistants404ResponseFromJSONTyped(json: any, ignoreDiscriminator: boolean): ListAssistants404Response { - if (json == null) { - return json; - } - return { - - 'message': json['message'] == null ? undefined : json['message'], - }; +export function ListAssistants404ResponseFromJSONTyped( + json: any, + ignoreDiscriminator: boolean, +): ListAssistants404Response { + if (json == null) { + return json; + } + return { + message: json["message"] == null ? 
undefined : json["message"], + }; } -export function ListAssistants404ResponseToJSON(json: any): ListAssistants404Response { - return ListAssistants404ResponseToJSONTyped(json, false); +export function ListAssistants404ResponseToJSON( + json: any, +): ListAssistants404Response { + return ListAssistants404ResponseToJSONTyped(json, false); } -export function ListAssistants404ResponseToJSONTyped(value?: ListAssistants404Response | null, ignoreDiscriminator: boolean = false): any { - if (value == null) { - return value; - } +export function ListAssistants404ResponseToJSONTyped( + value?: ListAssistants404Response | null, + ignoreDiscriminator: boolean = false, +): any { + if (value == null) { + return value; + } - return { - - 'message': value['message'], - }; + return { + message: value["message"], + }; } - diff --git a/packages/continue-sdk/typescript/api/src/models/index.ts b/packages/continue-sdk/typescript/api/src/models/index.ts index fe7e507e7..ccbb2cafa 100644 --- a/packages/continue-sdk/typescript/api/src/models/index.ts +++ b/packages/continue-sdk/typescript/api/src/models/index.ts @@ -1,6 +1,6 @@ /* tslint:disable */ /* eslint-disable */ -export * from './ListAssistants200ResponseInner'; -export * from './ListAssistants200ResponseInnerConfigResult'; -export * from './ListAssistants401Response'; -export * from './ListAssistants404Response'; +export * from "./ListAssistants200ResponseInner"; +export * from "./ListAssistants200ResponseInnerConfigResult"; +export * from "./ListAssistants401Response"; +export * from "./ListAssistants404Response"; diff --git a/packages/continue-sdk/typescript/api/src/runtime.ts b/packages/continue-sdk/typescript/api/src/runtime.ts index 759761f32..3679049f8 100644 --- a/packages/continue-sdk/typescript/api/src/runtime.ts +++ b/packages/continue-sdk/typescript/api/src/runtime.ts @@ -2,86 +2,97 @@ /* eslint-disable */ /** * Continue Hub IDE API - * API for Continue IDE to fetch assistants and other related information. 
These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. + * API for Continue IDE to fetch assistants and other related information. These endpoints are primarily used by the Continue IDE extensions for VS Code and JetBrains. * * The version of the OpenAPI document: 1.0.0 - * + * * * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). * https://openapi-generator.tech * Do not edit the class manually. */ - export const BASE_PATH = "https://api.continue.dev".replace(/\/+$/, ""); export interface ConfigurationParameters { - basePath?: string; // override base path - fetchApi?: FetchAPI; // override for fetch implementation - middleware?: Middleware[]; // middleware to apply before/after fetch requests - queryParamsStringify?: (params: HTTPQuery) => string; // stringify function for query strings - username?: string; // parameter for basic security - password?: string; // parameter for basic security - apiKey?: string | Promise | ((name: string) => string | Promise); // parameter for apiKey security - accessToken?: string | Promise | ((name?: string, scopes?: string[]) => string | Promise); // parameter for oauth2 security - headers?: HTTPHeaders; //header params we want to use on every request - credentials?: RequestCredentials; //value for the credentials param we want to use on each request + basePath?: string; // override base path + fetchApi?: FetchAPI; // override for fetch implementation + middleware?: Middleware[]; // middleware to apply before/after fetch requests + queryParamsStringify?: (params: HTTPQuery) => string; // stringify function for query strings + username?: string; // parameter for basic security + password?: string; // parameter for basic security + apiKey?: + | string + | Promise + | ((name: string) => string | Promise); // parameter for apiKey security + accessToken?: + | string + | Promise + | ((name?: string, scopes?: string[]) => string | Promise); // parameter for 
oauth2 security + headers?: HTTPHeaders; //header params we want to use on every request + credentials?: RequestCredentials; //value for the credentials param we want to use on each request } export class Configuration { - constructor(private configuration: ConfigurationParameters = {}) {} + constructor(private configuration: ConfigurationParameters = {}) {} - set config(configuration: Configuration) { - this.configuration = configuration; - } + set config(configuration: Configuration) { + this.configuration = configuration; + } - get basePath(): string { - return this.configuration.basePath != null ? this.configuration.basePath : BASE_PATH; - } + get basePath(): string { + return this.configuration.basePath != null + ? this.configuration.basePath + : BASE_PATH; + } - get fetchApi(): FetchAPI | undefined { - return this.configuration.fetchApi; - } + get fetchApi(): FetchAPI | undefined { + return this.configuration.fetchApi; + } - get middleware(): Middleware[] { - return this.configuration.middleware || []; - } + get middleware(): Middleware[] { + return this.configuration.middleware || []; + } - get queryParamsStringify(): (params: HTTPQuery) => string { - return this.configuration.queryParamsStringify || querystring; - } + get queryParamsStringify(): (params: HTTPQuery) => string { + return this.configuration.queryParamsStringify || querystring; + } - get username(): string | undefined { - return this.configuration.username; - } + get username(): string | undefined { + return this.configuration.username; + } - get password(): string | undefined { - return this.configuration.password; - } + get password(): string | undefined { + return this.configuration.password; + } - get apiKey(): ((name: string) => string | Promise) | undefined { - const apiKey = this.configuration.apiKey; - if (apiKey) { - return typeof apiKey === 'function' ? 
apiKey : () => apiKey; - } - return undefined; + get apiKey(): ((name: string) => string | Promise) | undefined { + const apiKey = this.configuration.apiKey; + if (apiKey) { + return typeof apiKey === "function" ? apiKey : () => apiKey; } + return undefined; + } - get accessToken(): ((name?: string, scopes?: string[]) => string | Promise) | undefined { - const accessToken = this.configuration.accessToken; - if (accessToken) { - return typeof accessToken === 'function' ? accessToken : async () => accessToken; - } - return undefined; + get accessToken(): + | ((name?: string, scopes?: string[]) => string | Promise) + | undefined { + const accessToken = this.configuration.accessToken; + if (accessToken) { + return typeof accessToken === "function" + ? accessToken + : async () => accessToken; } + return undefined; + } - get headers(): HTTPHeaders | undefined { - return this.configuration.headers; - } + get headers(): HTTPHeaders | undefined { + return this.configuration.headers; + } - get credentials(): RequestCredentials | undefined { - return this.configuration.credentials; - } + get credentials(): RequestCredentials | undefined { + return this.configuration.credentials; + } } export const DefaultConfig = new Configuration(); @@ -90,342 +101,429 @@ export const DefaultConfig = new Configuration(); * This is the base class for all generated API classes. 
*/ export class BaseAPI { + private static readonly jsonRegex = new RegExp( + "^(:?application/json|[^;/ \t]+/[^;/ \t]+[+]json)[ \t]*(:?;.*)?$", + "i", + ); + private middleware: Middleware[]; - private static readonly jsonRegex = new RegExp('^(:?application\/json|[^;/ \t]+\/[^;/ \t]+[+]json)[ \t]*(:?;.*)?$', 'i'); - private middleware: Middleware[]; + constructor(protected configuration = DefaultConfig) { + this.middleware = configuration.middleware; + } - constructor(protected configuration = DefaultConfig) { - this.middleware = configuration.middleware; + withMiddleware(this: T, ...middlewares: Middleware[]) { + const next = this.clone(); + next.middleware = next.middleware.concat(...middlewares); + return next; + } + + withPreMiddleware( + this: T, + ...preMiddlewares: Array + ) { + const middlewares = preMiddlewares.map((pre) => ({ pre })); + return this.withMiddleware(...middlewares); + } + + withPostMiddleware( + this: T, + ...postMiddlewares: Array + ) { + const middlewares = postMiddlewares.map((post) => ({ post })); + return this.withMiddleware(...middlewares); + } + + /** + * Check if the given MIME is a JSON MIME. + * JSON MIME examples: + * application/json + * application/json; charset=UTF8 + * APPLICATION/JSON + * application/vnd.company+json + * @param mime - MIME (Multipurpose Internet Mail Extensions) + * @return True if the given MIME is JSON, false otherwise. 
+ */ + protected isJsonMime(mime: string | null | undefined): boolean { + if (!mime) { + return false; + } + return BaseAPI.jsonRegex.test(mime); + } + + protected async request( + context: RequestOpts, + initOverrides?: RequestInit | InitOverrideFunction, + ): Promise { + const { url, init } = await this.createFetchParams(context, initOverrides); + const response = await this.fetchApi(url, init); + if (response && response.status >= 200 && response.status < 300) { + return response; + } + throw new ResponseError(response, "Response returned an error code"); + } + + private async createFetchParams( + context: RequestOpts, + initOverrides?: RequestInit | InitOverrideFunction, + ) { + let url = this.configuration.basePath + context.path; + if ( + context.query !== undefined && + Object.keys(context.query).length !== 0 + ) { + // only add the querystring to the URL if there are query parameters. + // this is done to avoid urls ending with a "?" character which buggy webservers + // do not handle correctly sometimes. + url += "?" + this.configuration.queryParamsStringify(context.query); } - withMiddleware(this: T, ...middlewares: Middleware[]) { - const next = this.clone(); - next.middleware = next.middleware.concat(...middlewares); - return next; + const headers = Object.assign( + {}, + this.configuration.headers, + context.headers, + ); + Object.keys(headers).forEach((key) => + headers[key] === undefined ? delete headers[key] : {}, + ); + + const initOverrideFn = + typeof initOverrides === "function" + ? 
initOverrides + : async () => initOverrides; + + const initParams = { + method: context.method, + headers, + body: context.body, + credentials: this.configuration.credentials, + }; + + const overriddenInit: RequestInit = { + ...initParams, + ...(await initOverrideFn({ + init: initParams, + context, + })), + }; + + let body: any; + if ( + isFormData(overriddenInit.body) || + overriddenInit.body instanceof URLSearchParams || + isBlob(overriddenInit.body) + ) { + body = overriddenInit.body; + } else if (this.isJsonMime(headers["Content-Type"])) { + body = JSON.stringify(overriddenInit.body); + } else { + body = overriddenInit.body; } - withPreMiddleware(this: T, ...preMiddlewares: Array) { - const middlewares = preMiddlewares.map((pre) => ({ pre })); - return this.withMiddleware(...middlewares); - } + const init: RequestInit = { + ...overriddenInit, + body, + }; - withPostMiddleware(this: T, ...postMiddlewares: Array) { - const middlewares = postMiddlewares.map((post) => ({ post })); - return this.withMiddleware(...middlewares); - } + return { url, init }; + } - /** - * Check if the given MIME is a JSON MIME. - * JSON MIME examples: - * application/json - * application/json; charset=UTF8 - * APPLICATION/JSON - * application/vnd.company+json - * @param mime - MIME (Multipurpose Internet Mail Extensions) - * @return True if the given MIME is JSON, false otherwise. 
- */ - protected isJsonMime(mime: string | null | undefined): boolean { - if (!mime) { - return false; + private fetchApi = async (url: string, init: RequestInit) => { + let fetchParams = { url, init }; + for (const middleware of this.middleware) { + if (middleware.pre) { + fetchParams = + (await middleware.pre({ + fetch: this.fetchApi, + ...fetchParams, + })) || fetchParams; + } + } + let response: Response | undefined = undefined; + try { + response = await (this.configuration.fetchApi || fetch)( + fetchParams.url, + fetchParams.init, + ); + } catch (e) { + for (const middleware of this.middleware) { + if (middleware.onError) { + response = + (await middleware.onError({ + fetch: this.fetchApi, + url: fetchParams.url, + init: fetchParams.init, + error: e, + response: response ? response.clone() : undefined, + })) || response; } - return BaseAPI.jsonRegex.test(mime); - } - - protected async request(context: RequestOpts, initOverrides?: RequestInit | InitOverrideFunction): Promise { - const { url, init } = await this.createFetchParams(context, initOverrides); - const response = await this.fetchApi(url, init); - if (response && (response.status >= 200 && response.status < 300)) { - return response; - } - throw new ResponseError(response, 'Response returned an error code'); - } - - private async createFetchParams(context: RequestOpts, initOverrides?: RequestInit | InitOverrideFunction) { - let url = this.configuration.basePath + context.path; - if (context.query !== undefined && Object.keys(context.query).length !== 0) { - // only add the querystring to the URL if there are query parameters. - // this is done to avoid urls ending with a "?" character which buggy webservers - // do not handle correctly sometimes. - url += '?' + this.configuration.queryParamsStringify(context.query); - } - - const headers = Object.assign({}, this.configuration.headers, context.headers); - Object.keys(headers).forEach(key => headers[key] === undefined ? 
delete headers[key] : {}); - - const initOverrideFn = - typeof initOverrides === "function" - ? initOverrides - : async () => initOverrides; - - const initParams = { - method: context.method, - headers, - body: context.body, - credentials: this.configuration.credentials, - }; - - const overriddenInit: RequestInit = { - ...initParams, - ...(await initOverrideFn({ - init: initParams, - context, - })) - }; - - let body: any; - if (isFormData(overriddenInit.body) - || (overriddenInit.body instanceof URLSearchParams) - || isBlob(overriddenInit.body)) { - body = overriddenInit.body; - } else if (this.isJsonMime(headers['Content-Type'])) { - body = JSON.stringify(overriddenInit.body); + } + if (response === undefined) { + if (e instanceof Error) { + throw new FetchError( + e, + "The request failed and the interceptors did not return an alternative response", + ); } else { - body = overriddenInit.body; + throw e; } - - const init: RequestInit = { - ...overriddenInit, - body - }; - - return { url, init }; + } } - - private fetchApi = async (url: string, init: RequestInit) => { - let fetchParams = { url, init }; - for (const middleware of this.middleware) { - if (middleware.pre) { - fetchParams = await middleware.pre({ - fetch: this.fetchApi, - ...fetchParams, - }) || fetchParams; - } - } - let response: Response | undefined = undefined; - try { - response = await (this.configuration.fetchApi || fetch)(fetchParams.url, fetchParams.init); - } catch (e) { - for (const middleware of this.middleware) { - if (middleware.onError) { - response = await middleware.onError({ - fetch: this.fetchApi, - url: fetchParams.url, - init: fetchParams.init, - error: e, - response: response ? 
response.clone() : undefined, - }) || response; - } - } - if (response === undefined) { - if (e instanceof Error) { - throw new FetchError(e, 'The request failed and the interceptors did not return an alternative response'); - } else { - throw e; - } - } - } - for (const middleware of this.middleware) { - if (middleware.post) { - response = await middleware.post({ - fetch: this.fetchApi, - url: fetchParams.url, - init: fetchParams.init, - response: response.clone(), - }) || response; - } - } - return response; + for (const middleware of this.middleware) { + if (middleware.post) { + response = + (await middleware.post({ + fetch: this.fetchApi, + url: fetchParams.url, + init: fetchParams.init, + response: response.clone(), + })) || response; + } } + return response; + }; - /** - * Create a shallow clone of `this` by constructing a new instance - * and then shallow cloning data members. - */ - private clone(this: T): T { - const constructor = this.constructor as any; - const next = new constructor(this.configuration); - next.middleware = this.middleware.slice(); - return next; - } -}; + /** + * Create a shallow clone of `this` by constructing a new instance + * and then shallow cloning data members. 
+ */ + private clone(this: T): T { + const constructor = this.constructor as any; + const next = new constructor(this.configuration); + next.middleware = this.middleware.slice(); + return next; + } +} function isBlob(value: any): value is Blob { - return typeof Blob !== 'undefined' && value instanceof Blob; + return typeof Blob !== "undefined" && value instanceof Blob; } function isFormData(value: any): value is FormData { - return typeof FormData !== "undefined" && value instanceof FormData; + return typeof FormData !== "undefined" && value instanceof FormData; } export class ResponseError extends Error { - override name: "ResponseError" = "ResponseError"; - constructor(public response: Response, msg?: string) { - super(msg); - } + override name: "ResponseError" = "ResponseError"; + constructor( + public response: Response, + msg?: string, + ) { + super(msg); + } } export class FetchError extends Error { - override name: "FetchError" = "FetchError"; - constructor(public cause: Error, msg?: string) { - super(msg); - } + override name: "FetchError" = "FetchError"; + constructor( + public cause: Error, + msg?: string, + ) { + super(msg); + } } export class RequiredError extends Error { - override name: "RequiredError" = "RequiredError"; - constructor(public field: string, msg?: string) { - super(msg); - } + override name: "RequiredError" = "RequiredError"; + constructor( + public field: string, + msg?: string, + ) { + super(msg); + } } export const COLLECTION_FORMATS = { - csv: ",", - ssv: " ", - tsv: "\t", - pipes: "|", + csv: ",", + ssv: " ", + tsv: "\t", + pipes: "|", }; -export type FetchAPI = WindowOrWorkerGlobalScope['fetch']; +export type FetchAPI = WindowOrWorkerGlobalScope["fetch"]; export type Json = any; -export type HTTPMethod = 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE' | 'OPTIONS' | 'HEAD'; +export type HTTPMethod = + | "GET" + | "POST" + | "PUT" + | "PATCH" + | "DELETE" + | "OPTIONS" + | "HEAD"; export type HTTPHeaders = { [key: string]: string }; 
-export type HTTPQuery = { [key: string]: string | number | null | boolean | Array | Set | HTTPQuery }; +export type HTTPQuery = { + [key: string]: + | string + | number + | null + | boolean + | Array + | Set + | HTTPQuery; +}; export type HTTPBody = Json | FormData | URLSearchParams; -export type HTTPRequestInit = { headers?: HTTPHeaders; method: HTTPMethod; credentials?: RequestCredentials; body?: HTTPBody }; -export type ModelPropertyNaming = 'camelCase' | 'snake_case' | 'PascalCase' | 'original'; +export type HTTPRequestInit = { + headers?: HTTPHeaders; + method: HTTPMethod; + credentials?: RequestCredentials; + body?: HTTPBody; +}; +export type ModelPropertyNaming = + | "camelCase" + | "snake_case" + | "PascalCase" + | "original"; -export type InitOverrideFunction = (requestContext: { init: HTTPRequestInit, context: RequestOpts }) => Promise +export type InitOverrideFunction = (requestContext: { + init: HTTPRequestInit; + context: RequestOpts; +}) => Promise; export interface FetchParams { - url: string; - init: RequestInit; + url: string; + init: RequestInit; } export interface RequestOpts { - path: string; - method: HTTPMethod; - headers: HTTPHeaders; - query?: HTTPQuery; - body?: HTTPBody; + path: string; + method: HTTPMethod; + headers: HTTPHeaders; + query?: HTTPQuery; + body?: HTTPBody; } -export function querystring(params: HTTPQuery, prefix: string = ''): string { - return Object.keys(params) - .map(key => querystringSingleKey(key, params[key], prefix)) - .filter(part => part.length > 0) - .join('&'); +export function querystring(params: HTTPQuery, prefix: string = ""): string { + return Object.keys(params) + .map((key) => querystringSingleKey(key, params[key], prefix)) + .filter((part) => part.length > 0) + .join("&"); } -function querystringSingleKey(key: string, value: string | number | null | undefined | boolean | Array | Set | HTTPQuery, keyPrefix: string = ''): string { - const fullKey = keyPrefix + (keyPrefix.length ? 
`[${key}]` : key); - if (value instanceof Array) { - const multiValue = value.map(singleValue => encodeURIComponent(String(singleValue))) - .join(`&${encodeURIComponent(fullKey)}=`); - return `${encodeURIComponent(fullKey)}=${multiValue}`; - } - if (value instanceof Set) { - const valueAsArray = Array.from(value); - return querystringSingleKey(key, valueAsArray, keyPrefix); - } - if (value instanceof Date) { - return `${encodeURIComponent(fullKey)}=${encodeURIComponent(value.toISOString())}`; - } - if (value instanceof Object) { - return querystring(value as HTTPQuery, fullKey); - } - return `${encodeURIComponent(fullKey)}=${encodeURIComponent(String(value))}`; +function querystringSingleKey( + key: string, + value: + | string + | number + | null + | undefined + | boolean + | Array + | Set + | HTTPQuery, + keyPrefix: string = "", +): string { + const fullKey = keyPrefix + (keyPrefix.length ? `[${key}]` : key); + if (value instanceof Array) { + const multiValue = value + .map((singleValue) => encodeURIComponent(String(singleValue))) + .join(`&${encodeURIComponent(fullKey)}=`); + return `${encodeURIComponent(fullKey)}=${multiValue}`; + } + if (value instanceof Set) { + const valueAsArray = Array.from(value); + return querystringSingleKey(key, valueAsArray, keyPrefix); + } + if (value instanceof Date) { + return `${encodeURIComponent(fullKey)}=${encodeURIComponent(value.toISOString())}`; + } + if (value instanceof Object) { + return querystring(value as HTTPQuery, fullKey); + } + return `${encodeURIComponent(fullKey)}=${encodeURIComponent(String(value))}`; } export function exists(json: any, key: string) { - const value = json[key]; - return value !== null && value !== undefined; + const value = json[key]; + return value !== null && value !== undefined; } export function mapValues(data: any, fn: (item: any) => any) { return Object.keys(data).reduce( (acc, key) => ({ ...acc, [key]: fn(data[key]) }), - {} + {}, ); } export function canConsumeForm(consumes: Consume[]): 
boolean { - for (const consume of consumes) { - if ('multipart/form-data' === consume.contentType) { - return true; - } + for (const consume of consumes) { + if ("multipart/form-data" === consume.contentType) { + return true; } - return false; + } + return false; } export interface Consume { - contentType: string; + contentType: string; } export interface RequestContext { - fetch: FetchAPI; - url: string; - init: RequestInit; + fetch: FetchAPI; + url: string; + init: RequestInit; } export interface ResponseContext { - fetch: FetchAPI; - url: string; - init: RequestInit; - response: Response; + fetch: FetchAPI; + url: string; + init: RequestInit; + response: Response; } export interface ErrorContext { - fetch: FetchAPI; - url: string; - init: RequestInit; - error: unknown; - response?: Response; + fetch: FetchAPI; + url: string; + init: RequestInit; + error: unknown; + response?: Response; } export interface Middleware { - pre?(context: RequestContext): Promise; - post?(context: ResponseContext): Promise; - onError?(context: ErrorContext): Promise; + pre?(context: RequestContext): Promise; + post?(context: ResponseContext): Promise; + onError?(context: ErrorContext): Promise; } export interface ApiResponse { - raw: Response; - value(): Promise; + raw: Response; + value(): Promise; } export interface ResponseTransformer { - (json: any): T; + (json: any): T; } export class JSONApiResponse { - constructor(public raw: Response, private transformer: ResponseTransformer = (jsonValue: any) => jsonValue) {} + constructor( + public raw: Response, + private transformer: ResponseTransformer = (jsonValue: any) => jsonValue, + ) {} - async value(): Promise { - return this.transformer(await this.raw.json()); - } + async value(): Promise { + return this.transformer(await this.raw.json()); + } } export class VoidApiResponse { - constructor(public raw: Response) {} + constructor(public raw: Response) {} - async value(): Promise { - return undefined; - } + async value(): Promise { + 
return undefined; + } } export class BlobApiResponse { - constructor(public raw: Response) {} + constructor(public raw: Response) {} - async value(): Promise { - return await this.raw.blob(); - }; + async value(): Promise { + return await this.raw.blob(); + } } export class TextApiResponse { - constructor(public raw: Response) {} + constructor(public raw: Response) {} - async value(): Promise { - return await this.raw.text(); - }; + async value(): Promise { + return await this.raw.text(); + } } diff --git a/packages/continue-sdk/typescript/api/tsconfig.json b/packages/continue-sdk/typescript/api/tsconfig.json index 250280d9a..e69c9c2f7 100644 --- a/packages/continue-sdk/typescript/api/tsconfig.json +++ b/packages/continue-sdk/typescript/api/tsconfig.json @@ -5,12 +5,7 @@ "module": "commonjs", "moduleResolution": "node", "outDir": "dist", - "typeRoots": [ - "node_modules/@types" - ] + "typeRoots": ["node_modules/@types"] }, - "exclude": [ - "dist", - "node_modules" - ] + "exclude": ["dist", "node_modules"] }