You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

66 lines
1.8 KiB

11 months ago
  1. const { getEncoding } = require("js-tiktoken");
  /**
   * Process-wide singleton around the `cl100k_base` encoder returned by
   * `getEncoding`, used to produce rough token counts for strings.
   *
   * Inputs that look too large to encode cheaply, and inputs the encoder
   * rejects, fall back to a length-based estimate instead of throwing.
   */
  class TikTokenTokenizer {
    /** Inputs estimated at this many KB or more are estimated, not encoded. */
    static MAX_KB_ESTIMATE = 10;
    /** The fallback estimate is `ceil(input.length / DIVISOR)`. */
    static DIVISOR = 8;

    /**
     * Creates the singleton on first use. Later `new` calls log a notice and
     * return the instance stored on `TikTokenTokenizer.instance`.
     */
    constructor() {
      if (TikTokenTokenizer.instance) {
        this.log(
          "Singleton instance already exists. Returning existing instance."
        );
        return TikTokenTokenizer.instance;
      }

      this.encoder = getEncoding("cl100k_base");
      TikTokenTokenizer.instance = this;
      this.log("Initialized new TikTokenTokenizer instance.");
    }

    /**
     * Writes a message to the console behind a colored class-name prefix.
     * @param {string} text
     * @param {...any} args - extra values forwarded to `console.log`
     */
    log(text, ...args) {
      console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
    }

    /**
     * Check if the input is too long to encode.
     * This is a rough estimate and a sanity check to prevent CPU issues from
     * encoding very large strings. Assumes 1 character = 2 bytes in JS.
     * @param {string} input
     * @returns {boolean}
     */
    #isTooLong(input) {
      const bytesEstimate = input.length * 2;
      const kbEstimate = Math.floor(bytesEstimate / 1024);
      return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
    }

    /**
     * Encode a string into tokens for rough token count estimation.
     * Returns the encoder's token count when the input is small enough to
     * encode, otherwise `ceil(length / DIVISOR)`; returns 0 when there is
     * nothing measurable.
     * @param {string} input
     * @returns {number}
     */
    tokenizeString(input = "") {
      // `null` is not replaced by the default parameter. Handle it here so it
      // does not throw inside #isTooLong and get reported as a tokenization
      // failure with a full stack trace; the result (0) is the same.
      if (input === null) return 0;

      try {
        if (this.#isTooLong(input)) {
          this.log("Input will take too long to encode - estimating");
          return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
        }
        return this.encoder.encode(input).length;
      } catch (e) {
        // `e` is not guaranteed to be an Error (or even an object), so read its
        // fields defensively; the fallback estimate must never throw itself.
        this.log(
          "Could not tokenize string! Estimating...",
          e?.message,
          e?.stack
        );
        return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
      }
    }
  }
  // Single tokenizer shared by every caller of this module.
  const tokenizer = new TikTokenTokenizer();

  /**
   * Rough token count for a string, delegated to the shared tokenizer's
   * `tokenizeString` method.
   * @param {string} input
   * @returns {number}
   */
  const tokenizeString = (input) => tokenizer.tokenizeString(input);

  module.exports = { tokenizeString };