const { v4: uuidv4 } = require("uuid");
const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const {
  writeResponseChunk,
  clientAbortedHandler,
} = require("../../helpers/chat/responses");
const {
  LLMPerformanceMonitor,
} = require("../../helpers/chat/LLMPerformanceMonitor");

function perplexityModels() {
  const { MODELS } = require("./models.js");
  return MODELS || {};
}
// Chat provider for the Perplexity API, reached through its OpenAI-compatible endpoint.
class PerplexityLLM {
  constructor(embedder = null, modelPreference = null) {
    if (!process.env.PERPLEXITY_API_KEY)
      throw new Error("No Perplexity API key was set.");

    const { OpenAI: OpenAIApi } = require("openai");
    this.openai = new OpenAIApi({
      baseURL: "https://api.perplexity.ai",
      apiKey: process.env.PERPLEXITY_API_KEY ?? null,
    });
    this.model =
      modelPreference ||
      process.env.PERPLEXITY_MODEL_PREF ||
      "llama-3-sonar-large-32k-online"; // Give at least a unique model to the provider as last fallback.
    this.limits = {
      history: this.promptWindowLimit() * 0.15,
      system: this.promptWindowLimit() * 0.15,
      user: this.promptWindowLimit() * 0.7,
    };

    this.embedder = embedder ?? new NativeEmbedder();
    this.defaultTemp = 0.7;
  }
  #appendContext(contextTexts = []) {
    if (!contextTexts || !contextTexts.length) return "";
    return (
      "\nContext:\n" +
      contextTexts
        .map((text, i) => {
          return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
        })
        .join("")
    );
  }

  allModelInformation() {
    return perplexityModels();
  }

  streamingEnabled() {
    return "streamGetChatCompletion" in this;
  }

  static promptWindowLimit(modelName) {
    const availableModels = perplexityModels();
    return availableModels[modelName]?.maxLength || 4096;
  }

  promptWindowLimit() {
    const availableModels = this.allModelInformation();
    return availableModels[this.model]?.maxLength || 4096;
  }

  async isValidChatCompletionModel(model = "") {
    const availableModels = this.allModelInformation();
    return availableModels.hasOwnProperty(model);
  }

  constructPrompt({
    systemPrompt = "",
    contextTexts = [],
    chatHistory = [],
    userPrompt = "",
  }) {
    const prompt = {
      role: "system",
      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
    };
    return [prompt, ...chatHistory, { role: "user", content: userPrompt }];
  }
  async getChatCompletion(messages = null, { temperature = 0.7 }) {
    if (!(await this.isValidChatCompletionModel(this.model)))
      throw new Error(
        `Perplexity chat: ${this.model} is not valid for chat completion!`
      );

    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.openai.chat.completions
        .create({
          model: this.model,
          messages,
          temperature,
        })
        .catch((e) => {
          throw new Error(e.message);
        })
    );

    if (
      !result.output.hasOwnProperty("choices") ||
      result.output.choices.length === 0
    )
      return null;

    return {
      textResponse: result.output.choices[0].message.content,
      metrics: {
        prompt_tokens: result.output.usage?.prompt_tokens || 0,
        completion_tokens: result.output.usage?.completion_tokens || 0,
        total_tokens: result.output.usage?.total_tokens || 0,
        // Guard against NaN when the API response has no usage block.
        outputTps:
          (result.output.usage?.completion_tokens || 0) / result.duration,
        duration: result.duration,
      },
    };
  }
  async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
    if (!(await this.isValidChatCompletionModel(this.model)))
      throw new Error(
        `Perplexity chat: ${this.model} is not valid for chat completion!`
      );

    const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
      this.openai.chat.completions.create({
        model: this.model,
        stream: true,
        messages,
        temperature,
      }),
      messages
    );
    return measuredStreamRequest;
  }
  // Rewrites Perplexity's numeric citation markers (e.g. "[1]") inside a token
  // into markdown links pointing at the corresponding citation URL, when one is
  // available. Markers without a matching citation are left untouched.
  enrichToken(token, citations) {
    if (Array.isArray(citations) && citations.length !== 0) {
      return token.replace(/\[(\d+)\]/g, (match, index) => {
        const citationIndex = parseInt(index, 10) - 1;
        return citations[citationIndex]
          ? `[[${index}](${citations[citationIndex]})]`
          : match;
      });
    }
    return token;
  }
  handleStream(response, stream, responseProps) {
    const timeoutThresholdMs = 800;
    const { uuid = uuidv4(), sources = [] } = responseProps;
    let hasUsageMetrics = false;
    let pplxCitations = []; // Array of links
    let usage = {
      completion_tokens: 0,
    };

    return new Promise(async (resolve) => {
      let fullText = "";
      let lastChunkTime = null;

      // If the client aborts the request, stop measuring and resolve with
      // whatever text has streamed so far.
      const handleAbort = () => {
        stream?.endMeasurement(usage);
        clientAbortedHandler(resolve, fullText);
      };
      response.on("close", handleAbort);

      // Watchdog: if the stream goes stale for longer than timeoutThresholdMs
      // without self-closing, finalize the response ourselves.
      const timeoutCheck = setInterval(() => {
        if (lastChunkTime === null) return;

        const now = Number(new Date());
        const diffMs = now - lastChunkTime;
        if (diffMs >= timeoutThresholdMs) {
          console.log(
            `Perplexity stream did not self-close and has been stale for >${timeoutThresholdMs}ms. Closing response stream.`
          );
          writeResponseChunk(response, {
            uuid,
            sources,
            type: "textResponseChunk",
            textResponse: "",
            close: true,
            error: false,
          });
          clearInterval(timeoutCheck);
          response.removeListener("close", handleAbort);
          stream?.endMeasurement(usage);
          resolve(fullText);
        }
      }, 500);

      // Now handle the chunks from the streamed response and append to fullText.
      try {
        for await (const chunk of stream) {
          lastChunkTime = Number(new Date());
          const message = chunk?.choices?.[0];
          const token = message?.delta?.content;

          if (Array.isArray(chunk.citations) && chunk.citations.length !== 0) {
            pplxCitations = chunk.citations;
          }

          // If we see usage metrics in the chunk, we can use them directly
          // instead of estimating them, but we only want to assign values if
          // the response object is the exact same key:value pair we expect.
          if (
            chunk.hasOwnProperty("usage") && // exists
            !!chunk.usage && // is not null
            Object.values(chunk.usage).length > 0 // has values
          ) {
            if (chunk.usage.hasOwnProperty("prompt_tokens")) {
              usage.prompt_tokens = Number(chunk.usage.prompt_tokens);
            }
            if (chunk.usage.hasOwnProperty("completion_tokens")) {
              hasUsageMetrics = true; // to stop estimating counter
              usage.completion_tokens = Number(chunk.usage.completion_tokens);
            }
          }

          if (token) {
            let enrichedToken = this.enrichToken(token, pplxCitations);
            fullText += enrichedToken;
            if (!hasUsageMetrics) usage.completion_tokens++;

            writeResponseChunk(response, {
              uuid,
              sources: [],
              type: "textResponseChunk",
              textResponse: enrichedToken,
              close: false,
              error: false,
            });
          }

          if (message?.finish_reason) {
            console.log("closing");
            writeResponseChunk(response, {
              uuid,
              sources,
              type: "textResponseChunk",
              textResponse: "",
              close: true,
              error: false,
            });
            response.removeListener("close", handleAbort);
            stream?.endMeasurement(usage);
            clearInterval(timeoutCheck);
            resolve(fullText);
            break; // Break streaming when a valid finish_reason is first encountered
          }
        }
      } catch (e) {
        console.log(`\x1b[43m\x1b[34m[STREAMING ERROR]\x1b[0m ${e.message}`);
        writeResponseChunk(response, {
          uuid,
          type: "abort",
          textResponse: null,
          sources: [],
          close: true,
          error: e.message,
        });
        stream?.endMeasurement(usage);
        clearInterval(timeoutCheck);
        resolve(fullText); // Return what we currently have - if anything.
      }
    });
  }
  // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
  async embedTextInput(textInput) {
    return await this.embedder.embedTextInput(textInput);
  }
  async embedChunks(textChunks = []) {
    return await this.embedder.embedChunks(textChunks);
  }

  async compressMessages(promptArgs = {}, rawHistory = []) {
    const { messageArrayCompressor } = require("../../helpers/chat");
    const messageArray = this.constructPrompt(promptArgs);
    return await messageArrayCompressor(this, messageArray, rawHistory);
  }
}

module.exports = {
  PerplexityLLM,
  perplexityModels,
};
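
// Usage sketch (illustrative only, not part of the original module): assuming
// this file is the provider's index.js and PERPLEXITY_API_KEY is set, a caller
// inside an async request handler (`res` being the HTTP response object that
// writeResponseChunk expects) might exercise the class roughly like this:
//
//   const { PerplexityLLM } = require("./index.js");
//   const llm = new PerplexityLLM(null, "llama-3-sonar-large-32k-online");
//   const messages = llm.constructPrompt({
//     systemPrompt: "You are a helpful assistant.",
//     contextTexts: [],
//     chatHistory: [],
//     userPrompt: "What is new in AI today?",
//   });
//
//   // Non-streaming:
//   const { textResponse, metrics } = await llm.getChatCompletion(messages, {
//     temperature: llm.defaultTemp,
//   });
//
//   // Streaming to the client:
//   const stream = await llm.streamGetChatCompletion(messages, {
//     temperature: llm.defaultTemp,
//   });
//   const fullText = await llm.handleStream(res, stream, { sources: [] });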