From a9a49f490aabc1918b0ef3895f94f2c43dd1a986 Mon Sep 17 00:00:00 2001
From: Webifi
Date: Mon, 12 Jun 2023 16:52:02 -0500
Subject: [PATCH 1/3] Allow automatic extension of truncated summary

---
 src/lib/ChatRequest.svelte      | 163 +++++++++++++++++---------------
 src/lib/ChatSettingField.svelte |   2 +-
 src/lib/Settings.svelte         |  17 +++-
 src/lib/Storage.svelte          |   1 +
 src/lib/Types.svelte            |  16 +++-
 5 files changed, 116 insertions(+), 83 deletions(-)

diff --git a/src/lib/ChatRequest.svelte b/src/lib/ChatRequest.svelte
index 2f1d640..00f62cd 100644
--- a/src/lib/ChatRequest.svelte
+++ b/src/lib/ChatRequest.svelte
@@ -58,50 +58,48 @@ export class ChatRequest {
     const chatResponse = new ChatCompletionResponse(opts)
     const promptTokenCount = countPromptTokens(messagePayload, model)
     const maxAllowed = maxTokens - (promptTokenCount + 1)
-
-    // Build and make the request
-    try {
-      // Build the API request body
-      const request: Request = {
-        model: chatSettings.model,
-        messages: messagePayload,
-        // Provide the settings by mapping the settingsMap to key/value pairs
-        ...getRequestSettingList().reduce((acc, setting) => {
-          const key = setting.key
-          let value = getChatSettingValueNullDefault(chatId, setting)
-          if (key in overrides) value = overrides[key]
-          if (typeof setting.apiTransform === 'function') {
-            value = setting.apiTransform(chatId, setting, value)
-          }
-          if (key === 'max_tokens') {
-            if (opts.maxTokens) value = opts.maxTokens // only as large as requested
-            if (value > maxAllowed || value < 1) value = null // if over max model, do not define max
-          }
-          if (key === 'n') {
-            if (opts.streaming || opts.summaryRequest) {
+
+    // Build the API request body
+    const request: Request = {
+      model: chatSettings.model,
+      messages: messagePayload,
+      // Provide the settings by mapping the settingsMap to key/value pairs
+      ...getRequestSettingList().reduce((acc, setting) => {
+        const key = setting.key
+        let value = getChatSettingValueNullDefault(chatId, setting)
+        if (key in overrides) value = overrides[key]
+        if (typeof setting.apiTransform === 'function') {
+          value = setting.apiTransform(chatId, setting, value)
+        }
+        if (key === 'max_tokens') {
+          if (opts.maxTokens) value = opts.maxTokens // only as large as requested
+          if (value > maxAllowed || value < 1) value = null // if over max model, do not define max
+        }
+        if (key === 'n') {
+          if (opts.streaming || opts.summaryRequest) {
             /* Streaming goes insane with more than one completion.
               Doesn't seem like there's any way to separate the jumbled mess of deltas for the different completions.
              Summary should only have one completion */
-              value = 1
-            }
-          }
-          if (value !== null) acc[key] = value
-          return acc
-        }, {}),
-        stream: opts.streaming
-      }
+            value = 1
+          }
+        }
+        if (value !== null) acc[key] = value
+        return acc
+      }, {}),
+      stream: opts.streaming
+    }
 
+    // Set up and make the request
+    try {
       // Add out token count to the response handler
       // (streaming doesn't return counts, so we need to do it client side)
       chatResponse.setPromptTokenCount(promptTokenCount)
 
       const signal = _this.controller.signal
-
-      // console.log('apikey', $apiKeyStorage)
-
+
       const fetchOptions = {
         method: 'POST',
         headers: {
@@ -297,6 +295,7 @@ export class ChatRequest {
       */
 
       const bottom = rw.slice(0 - pinBottom)
+      let continueCounter = chatSettings.summaryExtend + 1
       rw = rw.slice(0, 0 - pinBottom)
       let reductionPoolSize = countPromptTokens(rw, model)
       const ss = chatSettings.summarySize
@@ -340,53 +339,67 @@ export class ChatRequest {
 
       // Request and load the summarization prompt
       _this.updatingMessage = 'Summarizing...'
-      try {
-        const summary = await _this.sendRequest(top.concat(rw).concat([summaryRequest]), {
-          summaryRequest: true,
-          streaming: opts.streaming,
-          maxTokens: maxSummaryTokens,
-          fillMessage: summaryResponse,
-          autoAddMessages: true,
-          onMessageChange: (m) => {
-            if (opts.streaming) scrollToMessage(summaryResponse.uuid, 150, true, true)
+      const summarizedIds = rw.map(m => m.uuid)
+      const summaryIds = [summaryResponse.uuid]
+      while (continueCounter-- > 0) {
+        try {
+          const summary = await _this.sendRequest(top.concat(rw).concat([summaryRequest]), {
+            summaryRequest: true,
+            streaming: opts.streaming,
+            maxTokens: maxSummaryTokens,
+            fillMessage: summaryResponse,
+            autoAddMessages: true,
+            onMessageChange: (m) => {
+              if (opts.streaming) scrollToMessage(summaryResponse.uuid, 150, true, true)
+            }
+          } as ChatCompletionOpts, {
+            temperature: 0.1, // make summary more deterministic
+            top_p: 1,
+            presence_penalty: 0,
+            frequency_penalty: 0,
+            ...overrides
+          } as ChatSettings)
+          // Wait for the response to complete
+          if (!summary.hasFinished()) await summary.promiseToFinish()
+          if (summary.hasError()) {
+            // Failed for some API issue. Let the original caller handle it.
+            _this.updating = false
+            _this.updatingMessage = ''
+            deleteMessage(chatId, srid)
+            return summary
           }
-        } as ChatCompletionOpts, {
-          temperature: 0, // make summary more deterministic
-          top_p: 0.5,
-          presence_penalty: 0,
-          frequency_penalty: 0,
-          ...overrides
-        } as ChatSettings)
-        // Wait for the response to complete
-        if (!summary.hasFinished()) await summary.promiseToFinish()
-        if (summary.hasError()) {
-          // Failed to some API issue. let the original caller handle it.
-          deleteMessage(chatId, summaryResponse.uuid)
-          return summary
-        } else {
-          // Looks like we got our summarized messages.
-          // Mark the new summaries as such
-          summaryResponse.summary = rw.map(m => m.uuid)
-          const summaryIds = [summaryResponse.uuid]
-          // Disable the messages we summarized so they still show in history
-          rw.forEach((m, i) => { m.summarized = summaryIds })
-          saveChatStore()
-          // Re-run request with summarized prompts
-          // return { error: { message: "End for now" } } as Response
-          _this.updatingMessage = 'Continuing...'
-          scrollToBottom(true)
-          return await _this.sendRequest(chat.messages, {
-            ...opts,
-            didSummary: true
-          },
-          overrides)
+          // Looks like we got our summarized messages.
+          // Mark the new summaries as such
+          // Need more?
+          if (summaryResponse.finish_reason === 'length' && continueCounter > 0) {
+            // Our summary was truncated
+            // Try to get more of it
+            delete summaryResponse.finish_reason
+            _this.updatingMessage = 'Summarizing more...'
+            continue
+          } else {
+            // We're done
+            continueCounter = 0
+          }
+        } catch (e) {
+          _this.updating = false
+          _this.updatingMessage = ''
+          deleteMessage(chatId, srid)
+          throw e
+        }
+      }
-      } catch (e) {
-        _this.updating = false
-        _this.updatingMessage = ''
-        deleteMessage(chatId, srid)
-        throw e
-      }
+      summaryResponse.summary = summarizedIds
+      // Disable the messages we summarized so they still show in history
+      rw.forEach((m, i) => { m.summarized = summaryIds })
+      saveChatStore()
+      // Re-run request with summarized prompts
+      _this.updatingMessage = 'Continuing...'
+      scrollToBottom(true)
+      return await _this.sendRequest(chat.messages, {
+        ...opts,
+        didSummary: true
+      },
+      overrides)
     } else {
       /***************
       * Unknown mode.
diff --git a/src/lib/ChatSettingField.svelte b/src/lib/ChatSettingField.svelte
index b33d9c1..629b6a4 100644
--- a/src/lib/ChatSettingField.svelte
+++ b/src/lib/ChatSettingField.svelte
@@ -177,7 +177,7 @@
           placeholder={String(setting.placeholder || chatDefaults[setting.key])}
           on:change={e => queueSettingValueChange(e, setting)}
         />
-      {:else if setting.type === 'select'}
+      {:else if setting.type === 'select' || setting.type === 'select-number'}
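
Note on the change: the heart of this patch is the `continueCounter` loop above. When a summary completion stops with `finish_reason === 'length'`, it was cut off by `maxSummaryTokens`, so the loop deletes `finish_reason` and re-requests up to `summaryExtend` more times, streaming each continuation into the same `summaryResponse` via `fillMessage`. Below is a minimal standalone TypeScript sketch of that pattern; `requestSummary`, `SummaryResult`, and `summarizeWithExtension` are hypothetical stand-ins invented for this note, not the app's real API.

// Sketch of the extend-on-truncation loop (hypothetical names, not the app's API)
interface SummaryResult {
  text: string
  finishReason: 'stop' | 'length' | null
}

// Hypothetical single-shot request: returns the completion text and the
// API's finish reason for the prompt it was given.
declare function requestSummary (prompt: string): Promise<SummaryResult>

async function summarizeWithExtension (prompt: string, summaryExtend: number): Promise<string> {
  let summary = ''
  // One initial pass plus up to summaryExtend continuation passes
  let continueCounter = summaryExtend + 1
  while (continueCounter-- > 0) {
    const result = await requestSummary(prompt + summary)
    summary += result.text
    // 'length' means the completion hit its token limit and was truncated,
    // so loop around and ask the model to continue; anything else means done.
    if (result.finishReason !== 'length') break
  }
  return summary
}

Feeding the accumulated text back into the prompt is what lets the model resume where the truncated completion stopped; the patch itself achieves the same effect by reusing `fillMessage: summaryResponse` across iterations and clearing `finish_reason` before each retry.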