Преглед на файлове

enh: token text splitter support

Timothy J. Baek преди 6 месеца
родител
ревизия
586e005f0f
променени са 3 файла, в които са добавени 32 реда и са изтрити 7 реда
  1. 10 7
      backend/open_webui/apps/retrieval/main.py
  2. 4 0
      backend/open_webui/apps/retrieval/utils.py
  3. 18 0
      src/lib/components/admin/Settings/Documents.svelte

+ 10 - 7
backend/open_webui/apps/retrieval/main.py

@@ -392,18 +392,19 @@ async def get_rag_config(user=Depends(get_admin_user)):
     return {
         "status": True,
         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
-        "file": {
-            "max_size": app.state.config.FILE_MAX_SIZE,
-            "max_count": app.state.config.FILE_MAX_COUNT,
-        },
         "content_extraction": {
             "engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
             "tika_server_url": app.state.config.TIKA_SERVER_URL,
         },
         "chunk": {
+            "text_splitter": app.state.config.TEXT_SPLITTER,
             "chunk_size": app.state.config.CHUNK_SIZE,
             "chunk_overlap": app.state.config.CHUNK_OVERLAP,
         },
+        "file": {
+            "max_size": app.state.config.FILE_MAX_SIZE,
+            "max_count": app.state.config.FILE_MAX_COUNT,
+        },
         "youtube": {
             "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
             "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
@@ -442,6 +443,7 @@ class ContentExtractionConfig(BaseModel):
 
 
 class ChunkParamUpdateForm(BaseModel):
+    text_splitter: Optional[str] = None
     chunk_size: int
     chunk_overlap: int
 
@@ -501,6 +503,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
         app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url
 
     if form_data.chunk is not None:
+        app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
         app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
         app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
 
@@ -547,6 +550,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
             "tika_server_url": app.state.config.TIKA_SERVER_URL,
         },
         "chunk": {
+            "text_splitter": app.state.config.TEXT_SPLITTER,
             "chunk_size": app.state.config.CHUNK_SIZE,
             "chunk_overlap": app.state.config.CHUNK_OVERLAP,
         },
@@ -607,11 +611,10 @@ class QuerySettingsForm(BaseModel):
 async def update_query_settings(
     form_data: QuerySettingsForm, user=Depends(get_admin_user)
 ):
-    app.state.config.RAG_TEMPLATE = (
-        form_data.template if form_data.template != "" else DEFAULT_RAG_TEMPLATE
-    )
+    app.state.config.RAG_TEMPLATE = form_data.template
     app.state.config.TOP_K = form_data.k if form_data.k else 4
     app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
+
     app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
         form_data.hybrid if form_data.hybrid else False
     )

+ 4 - 0
backend/open_webui/apps/retrieval/utils.py

@@ -19,6 +19,7 @@ from open_webui.apps.retrieval.vector.connector import VECTOR_DB_CLIENT
 from open_webui.utils.misc import get_last_user_message
 
 from open_webui.env import SRC_LOG_LEVELS
+from open_webui.config import DEFAULT_RAG_TEMPLATE
 
 
 log = logging.getLogger(__name__)
@@ -239,6 +240,9 @@ def query_collection_with_hybrid_search(
 
 
 def rag_template(template: str, context: str, query: str):
+    if template == "":
+        template = DEFAULT_RAG_TEMPLATE
+
     if "[context]" not in template and "{{CONTEXT}}" not in template:
         log.debug(
             "WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder."

+ 18 - 0
src/lib/components/admin/Settings/Documents.svelte

@@ -27,6 +27,7 @@
 	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import Switch from '$lib/components/common/Switch.svelte';
+	import { text } from '@sveltejs/kit';
 
 	const i18n = getContext('i18n');
 
@@ -49,6 +50,7 @@
 	let tikaServerUrl = '';
 	let showTikaServerUrl = false;
 
+	let textSplitter = '';
 	let chunkSize = 0;
 	let chunkOverlap = 0;
 	let pdfExtractImages = true;
@@ -178,6 +180,7 @@
 				max_count: fileMaxCount === '' ? null : fileMaxCount
 			},
 			chunk: {
+				text_splitter: textSplitter,
 				chunk_overlap: chunkOverlap,
 				chunk_size: chunkSize
 			},
@@ -223,11 +226,13 @@
 		await setRerankingConfig();
 
 		querySettings = await getQuerySettings(localStorage.token);
+
 		const res = await getRAGConfig(localStorage.token);
 
 		if (res) {
 			pdfExtractImages = res.pdf_extract_images;
 
+			textSplitter = res.chunk.text_splitter;
 			chunkSize = res.chunk.chunk_size;
 			chunkOverlap = res.chunk.chunk_overlap;
 
@@ -639,6 +644,19 @@
 		<div class=" ">
 			<div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div>
 
+			<div class="flex w-full justify-between mb-1.5">
+				<div class="self-center text-xs font-medium">{$i18n.t('Text Splitter')}</div>
+				<div class="flex items-center relative">
+					<select
+						class="dark:bg-gray-900 w-fit pr-8 rounded px-2 text-xs bg-transparent outline-none text-right"
+						bind:value={textSplitter}
+					>
+						<option value="">{$i18n.t('Default (Character)')} </option>
+						<option value="token">{$i18n.t('Token (Tiktoken)')}</option>
+					</select>
+				</div>
+			</div>
+
 			<div class=" flex gap-1.5">
 				<div class="  w-full justify-between">
 					<div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div>