You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
const { getEncoding } = require("js-tiktoken");
/**
 * Singleton wrapper around the js-tiktoken `cl100k_base` encoder, used for
 * rough token-count estimation.
 *
 * Constructing the class a second time returns the first instance, so
 * `getEncoding` is only called once.
 */
class TikTokenTokenizer {
  /** Inputs estimated at this many KB or more are estimated instead of encoded. */
  static MAX_KB_ESTIMATE = 10;
  /** Characters-per-token divisor used by the fallback estimate. */
  static DIVISOR = 8;

  constructor() {
    if (TikTokenTokenizer.instance) {
      this.log(
        "Singleton instance already exists. Returning existing instance."
      );
      return TikTokenTokenizer.instance;
    }

    this.encoder = getEncoding("cl100k_base");
    TikTokenTokenizer.instance = this;
    this.log("Initialized new TikTokenTokenizer instance.");
  }

  /**
   * Write a message to the console with a colored class-name prefix.
   * @param {string} text
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
  }

  /**
   * Check if the input is too long to encode.
   * This is a rough estimate and a sanity check to prevent CPU issues from
   * encoding very large strings. Assumes 1 character = 2 bytes in JS.
   * @param {string} input
   * @returns {boolean} True when the estimated size reaches MAX_KB_ESTIMATE.
   */
  #isTooLong(input) {
    const bytesEstimate = input.length * 2;
    const kbEstimate = Math.floor(bytesEstimate / 1024);
    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
  }

  /**
   * Encode a string into tokens for rough token count estimation.
   * Oversized input, and input the encoder fails on, is estimated as
   * `ceil(length / DIVISOR)` instead of being encoded.
   * @param {string} input
   * @returns {number} Token count or estimate; 0 for null/undefined input.
   */
  tokenizeString(input = "") {
    // The default parameter only covers `undefined`. Without this guard a
    // `null` input throws inside the length check and is reported (with a
    // stack trace) as a tokenizer failure, even though the result is just 0.
    if (input == null) return 0;

    try {
      if (this.#isTooLong(input)) {
        this.log("Input will take too long to encode - estimating");
        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
      }

      return this.encoder.encode(input).length;
    } catch (e) {
      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
      // `|| 0` turns the NaN produced by a missing `length` into 0.
      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
    }
  }
}
const tokenizer = new TikTokenTokenizer();module.exports = { /** * Encode a string into tokens for rough token count estimation. * @param {string} input * @returns {number} */ tokenizeString: (input) => tokenizer.tokenizeString(input),};
|