You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

68 lines
2.6 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const { writeToServerDocuments } = require("../utils/files");
  3. const { tokenizeString } = require("../utils/tokenizer");
  4. const { default: slugify } = require("slugify");
  /**
   * Builds a slug from `input` via `slugify(..., { lower: true })`, dropping the
   * final ".extension" segment when the input contains a dot.
   *
   * When one or more dots are present, everything after the last dot is
   * discarded and the remaining dot-separated segments are joined with "-"
   * before slugifying (so "a.b.txt" is slugified as "a-b").
   *
   * @param {*} input - Usually a document title or filename. Non-string values
   *   are coerced with `String()`; `null`/`undefined` become an empty string so
   *   a missing title does not throw.
   * @returns {string} The value returned by `slugify`.
   */
  function stripAndSlug(input) {
    let text = input;
    if (typeof text !== "string") {
      // `.includes` / `.split` below only exist on strings.
      text = text == null ? "" : String(text);
    }

    if (!text.includes(".")) return slugify(text, { lower: true });

    const withoutExtension = text.split(".").slice(0, -1).join("-");
    return slugify(withoutExtension, { lower: true });
  }
  /**
   * Normalizers that turn caller-supplied metadata into the fields stored with
   * a raw-text document. Each function under `possible` receives the whole
   * metadata object and returns the value for the key it is named after,
   * substituting a fallback when the supplied value is missing or unusable.
   */
  const METADATA_KEYS = {
    possible: {
      // http(s) URLs become `web://<url>.website`; anything else is reported as
      // a `file://` path derived from the title.
      url: ({ url, title }) => {
        let validUrl = false;
        try {
          const u = new URL(url);
          validUrl = ["https:", "http:"].includes(u.protocol);
        } catch {
          // Deliberately ignored: an unparsable URL falls through to the
          // file:// form below.
        }
        // NOTE(review): this lower-cases the entire URL, including path and
        // query string — confirm that is intended.
        if (validUrl) return `web://${url.toLowerCase()}.website`;
        return `file://${stripAndSlug(title)}.txt`;
      },
      title: ({ title }) => `${stripAndSlug(title)}.txt`,
      docAuthor: ({ docAuthor }) =>
        typeof docAuthor === "string" ? docAuthor : "no author specified",
      description: ({ description }) =>
        typeof description === "string" ? description : "no description found",
      docSource: ({ docSource }) =>
        typeof docSource === "string" ? docSource : "no source set",
      chunkSource: ({ chunkSource, title }) =>
        typeof chunkSource === "string"
          ? chunkSource
          : `${stripAndSlug(title)}.txt`,
      // Interprets `published` as a numeric timestamp passed to `new Date()`.
      // Falls back to the current time when the value is absent or does not
      // yield a valid date.
      published: ({ published }) => {
        // `Number(null)`, `Number("")` and `Number(false)` are all 0, which
        // would otherwise be reported as the 1970 epoch instead of "missing".
        const isMissing =
          published == null ||
          typeof published === "boolean" ||
          (typeof published === "string" && published.trim() === "");

        const parsed = new Date(isMissing ? NaN : Number(published));
        // Covers NaN, Infinity and timestamps outside the range Date supports.
        const date = Number.isNaN(parsed.getTime()) ? new Date() : parsed;
        return date.toLocaleString();
      },
    },
  };
  /**
   * Wraps a raw text string and its metadata into a document record and hands
   * it to `writeToServerDocuments`.
   *
   * @param {string} textContent - Raw text to store. Anything that is not a
   *   string, or that contains only whitespace, is treated as empty.
   * @param {object} [metadata] - Caller-supplied metadata; each stored field is
   *   derived from it through `METADATA_KEYS.possible`. A missing value is
   *   treated as an empty object.
   * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
   *   On success `documents` holds the single value returned by
   *   `writeToServerDocuments`; on empty input it is an empty array and
   *   `reason` explains why.
   */
  async function processRawText(textContent, metadata) {
    // Guard before the first property access so a missing metadata object does
    // not throw.
    const meta = metadata == null ? {} : metadata;
    console.log(`-- Working Raw Text doc ${meta.title} --`);

    if (typeof textContent !== "string" || textContent.trim().length === 0) {
      return {
        success: false,
        reason: "textContent was empty - nothing to process.",
        documents: [],
      };
    }

    const fields = METADATA_KEYS.possible;
    const data = {
      id: v4(),
      url: fields.url(meta),
      title: fields.title(meta),
      docAuthor: fields.docAuthor(meta),
      description: fields.description(meta),
      docSource: fields.docSource(meta),
      chunkSource: fields.chunkSource(meta),
      published: fields.published(meta),
      // Split on any whitespace run so newlines/tabs separate words and
      // repeated spaces do not produce empty "words".
      wordCount: textContent.trim().split(/\s+/).length,
      pageContent: textContent,
      token_count_estimate: tokenizeString(textContent),
    };

    // NOTE(review): the return value is used without `await` — presumably
    // writeToServerDocuments is synchronous; confirm against its definition.
    const document = writeToServerDocuments(
      data,
      `raw-${stripAndSlug(meta.title)}-${data.id}`
    );
    console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
    return { success: true, reason: null, documents: [document] };
  }
  62. module.exports = { processRawText }