|
| 1 | +/** |
| 2 | + * @author grmartin [grmartin] |
| 3 | + * @copyright Crown Copyright 2016 |
| 4 | + * @license Apache-2.0 |
| 5 | + */ |
| 6 | + |
| 7 | +import Operation from "../Operation.mjs"; |
| 8 | + |
// Maps each supported model name to a thunk that lazily imports the matching
// gpt-tokenizer model module and resolves to its countTokens function.
// Lazy dynamic imports keep tokenizer data out of the bundle until needed.
const MODEL_TO_COUNT_TOKENS = {
    // cl100k_base models — the gpt-4 family (except gpt-4o) shares the gpt-4
    // encoding; the gpt-3.5 family shares the gpt-3.5-turbo encoding.
    "gpt-4": () => import("gpt-tokenizer/model/gpt-4").then(m => m.countTokens),
    "gpt-4-32k": () => import("gpt-tokenizer/model/gpt-4").then(m => m.countTokens),
    "gpt-4-turbo": () => import("gpt-tokenizer/model/gpt-4").then(m => m.countTokens),
    // o200k_base model — gpt-4o uses a different encoding from the rest of the
    // gpt-4 family, so it must load its own model module, not gpt-3.5-turbo's.
    "gpt-4o": () => import("gpt-tokenizer/model/gpt-4o").then(m => m.countTokens),
    "gpt-4-0125-preview": () => import("gpt-tokenizer/model/gpt-4").then(m => m.countTokens),
    "gpt-4-1106-preview": () => import("gpt-tokenizer/model/gpt-4").then(m => m.countTokens),
    "gpt-3.5-turbo": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
    "gpt-3.5-turbo-16k": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
    "gpt-3.5-turbo-instruct": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
    "gpt-3.5-turbo-0125": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
    "gpt-3.5-turbo-1106": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
    "text-embedding-ada-002": () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => m.countTokens),
    "text-embedding-3-large": () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => m.countTokens),
    "text-embedding-3-small": () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => m.countTokens),

    // p50k_base models
    "code-davinci-002": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
    "code-davinci-001": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
    "code-cushman-002": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
    "code-cushman-001": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
    "text-davinci-002": () => import("gpt-tokenizer/model/text-davinci-002").then(m => m.countTokens),
    "text-davinci-003": () => import("gpt-tokenizer/model/text-davinci-003").then(m => m.countTokens),

    // p50k_edit models
    "text-davinci-edit-001": () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => m.countTokens),
    "code-davinci-edit-001": () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => m.countTokens),

    // r50k_base models
    "davinci": () => import("gpt-tokenizer/model/davinci").then(m => m.countTokens),
    "curie": () => import("gpt-tokenizer/model/curie").then(m => m.countTokens),
    "babbage": () => import("gpt-tokenizer/model/babbage").then(m => m.countTokens),
    "ada": () => import("gpt-tokenizer/model/ada").then(m => m.countTokens),
};
| 45 | + |
| 46 | + |
/**
 * Count AI Tokens operation.
 *
 * Tokenises the input with niieani/gpt-tokenizer, using the encoding that
 * matches the model selected in the operation's arguments.
 */
class CountAITokens extends Operation {

    /**
     * CountAITokens constructor
     */
    constructor() {
        super();

        this.name = "Count AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Counts the number of GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Model",
                type: "option",
                // Option list is derived from the mapping so the UI and the
                // supported tokenizers can never drift apart.
                value: Object.keys(MODEL_TO_COUNT_TOKENS),
            }
        ];
    }

    /**
     * Counts the tokens in the input using the selected model's tokenizer.
     *
     * @param {string} input - the text to tokenise
     * @param {Object[]} args - args[0] is the model name chosen in the UI
     * @returns {Promise<string>} the token count rendered as a decimal string,
     *     or the empty string for empty input
     */
    async run(input, args) {
        if (!input) return "";

        const [model] = args;
        let countTokensFn;
        if (MODEL_TO_COUNT_TOKENS[model]) {
            countTokensFn = await MODEL_TO_COUNT_TOKENS[model]();
        } else {
            // Unknown model name: fall back to the gpt-3.5-turbo encoding.
            countTokensFn = (await import("gpt-tokenizer/model/gpt-3.5-turbo")).countTokens;
        }

        const tokenCount = countTokensFn(input);
        return tokenCount.toString();
    }

}

export default CountAITokens;
| 100 | + |
0 commit comments