فهرست منبع

Added HTML and TypeScript UI components to support configuration of the text extraction engine.

Updated RAG /config and /config/update endpoints to support UI updates.

Fixed .dockerignore to prevent Python venv from being copied into Docker image.
Nicko van Someren 10 ماه پیش
والد
کامیت
7aa35a3757
5 فایلهای تغییر یافته به همراه 86 افزوده شده و 9 حذف شده
  1. 1 0
      .dockerignore
  2. 25 5
      backend/apps/rag/main.py
  3. 4 4
      backend/config.py
  4. 6 0
      src/lib/apis/rag/index.ts
  5. 50 0
      src/lib/components/admin/Settings/Documents.svelte

+ 1 - 0
.dockerignore

@@ -11,6 +11,7 @@ vite.config.js.timestamp-*
 vite.config.ts.timestamp-*
 __pycache__
 .idea
+venv
 _old
 uploads
 .ipynb_checkpoints

+ 25 - 5
backend/apps/rag/main.py

@@ -93,7 +93,7 @@ from config import (
     SRC_LOG_LEVELS,
     UPLOAD_DIR,
     DOCS_DIR,
-    DOCUMENT_USE_TIKA,
+    TEXT_EXTRACTION_ENGINE,
     TIKA_SERVER_URL,
     RAG_TOP_K,
     RAG_RELEVANCE_THRESHOLD,
@@ -150,6 +150,9 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
     ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
 )
 
+app.state.config.TEXT_EXTRACTION_ENGINE = TEXT_EXTRACTION_ENGINE
+app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
+
 app.state.config.CHUNK_SIZE = CHUNK_SIZE
 app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP
 
@@ -390,6 +393,10 @@ async def get_rag_config(user=Depends(get_admin_user)):
     return {
         "status": True,
         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
+        "text_extraction": {
+            "engine": app.state.config.TEXT_EXTRACTION_ENGINE,
+            "tika_server_url": app.state.config.TIKA_SERVER_URL,
+        },
         "chunk": {
             "chunk_size": app.state.config.CHUNK_SIZE,
             "chunk_overlap": app.state.config.CHUNK_OVERLAP,
@@ -419,6 +426,11 @@ async def get_rag_config(user=Depends(get_admin_user)):
     }
 
 
+class TextExtractionConfig(BaseModel):
+    engine: str = ""
+    tika_server_url: Optional[str] = None
+
+
 class ChunkParamUpdateForm(BaseModel):
     chunk_size: int
     chunk_overlap: int
@@ -452,6 +464,7 @@ class WebConfig(BaseModel):
 
 class ConfigUpdateForm(BaseModel):
     pdf_extract_images: Optional[bool] = None
+    text_extraction: Optional[TextExtractionConfig] = None
     chunk: Optional[ChunkParamUpdateForm] = None
     youtube: Optional[YoutubeLoaderConfig] = None
     web: Optional[WebConfig] = None
@@ -465,6 +478,11 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
         else app.state.config.PDF_EXTRACT_IMAGES
     )
 
+    if form_data.text_extraction is not None:
+        log.info(f"Updating text settings: {form_data.text_extraction}")
+        app.state.config.TEXT_EXTRACTION_ENGINE = form_data.text_extraction.engine
+        app.state.config.TIKA_SERVER_URL = form_data.text_extraction.tika_server_url
+
     if form_data.chunk is not None:
         app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
         app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
@@ -501,6 +519,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
     return {
         "status": True,
         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
+        "text_extraction": {
+            "engine": app.state.config.TEXT_EXTRACTION_ENGINE,
+            "tika_server_url": app.state.config.TIKA_SERVER_URL,
+        },
         "chunk": {
             "chunk_size": app.state.config.CHUNK_SIZE,
             "chunk_overlap": app.state.config.CHUNK_OVERLAP,
@@ -1001,7 +1023,7 @@ class TikaLoader:
         else:
             headers = {}
 
-        endpoint = str(TIKA_SERVER_URL)
+        endpoint = app.state.config.TIKA_SERVER_URL
         if not endpoint.endswith("/"):
             endpoint += "/"
         endpoint += "tika/text"
@@ -1072,9 +1094,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
         "msg",
     ]
 
-    log.warning("Use tika: %s, server URL: %s", DOCUMENT_USE_TIKA, TIKA_SERVER_URL)
-
-    if DOCUMENT_USE_TIKA and TIKA_SERVER_URL:
+    if app.state.config.TEXT_EXTRACTION_ENGINE == "tika" and app.state.config.TIKA_SERVER_URL:
         if file_ext in known_source_ext or (
                 file_content_type and file_content_type.find("text/") >= 0
         ):

+ 4 - 4
backend/config.py

@@ -882,10 +882,10 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "":
 # RAG document text extraction
 ####################################
 
-DOCUMENT_USE_TIKA = PersistentConfig(
-    "DOCUMENT_USE_TIKA",
-    "rag.document_use_tika",
-    os.environ.get("DOCUMENT_USE_TIKA", "false").lower() == "true"
+TEXT_EXTRACTION_ENGINE = PersistentConfig(
+    "TEXT_EXTRACTION_ENGINE",
+    "rag.text_extraction_engine",
+    os.environ.get("TEXT_EXTRACTION_ENGINE", "").lower()
 )
 
 TIKA_SERVER_URL = PersistentConfig(

+ 6 - 0
src/lib/apis/rag/index.ts

@@ -32,6 +32,11 @@ type ChunkConfigForm = {
 	chunk_overlap: number;
 };
 
+type TextExtractConfigForm = {
+	engine: string;
+	tika_server_url: string | null;
+};
+
 type YoutubeConfigForm = {
 	language: string[];
 	translation?: string | null;
@@ -40,6 +45,7 @@ type YoutubeConfigForm = {
 type RAGConfigForm = {
 	pdf_extract_images?: boolean;
 	chunk?: ChunkConfigForm;
+	text_extraction?: TextExtractConfigForm;
 	web_loader_ssl_verification?: boolean;
 	youtube?: YoutubeConfigForm;
 };

+ 50 - 0
src/lib/components/admin/Settings/Documents.svelte

@@ -37,6 +37,10 @@
 	let embeddingModel = '';
 	let rerankingModel = '';
 
+	let textExtractionEngine = 'default';
+	let tikaServerUrl = '';
+	let showTikaServerUrl = false;
+
 	let chunkSize = 0;
 	let chunkOverlap = 0;
 	let pdfExtractImages = true;
@@ -163,11 +167,20 @@
 			rerankingModelUpdateHandler();
 		}
 
+		if (textExtractionEngine === 'tika' && tikaServerUrl === '') {
+			toast.error($i18n.t('Tika Server URL required.'));
+			return;
+		}
+
 		const res = await updateRAGConfig(localStorage.token, {
 			pdf_extract_images: pdfExtractImages,
 			chunk: {
 				chunk_overlap: chunkOverlap,
 				chunk_size: chunkSize
+			},
+			text_extraction: {
+				engine: textExtractionEngine,
+				tika_server_url: tikaServerUrl
 			}
 		});
 
@@ -213,6 +226,10 @@
 
 			chunkSize = res.chunk.chunk_size;
 			chunkOverlap = res.chunk.chunk_overlap;
+
+			textExtractionEngine = res.text_extraction.engine;
+			tikaServerUrl = res.text_extraction.tika_server_url;
+			showTikaServerUrl = textExtractionEngine === 'tika';
 		}
 	});
 </script>
@@ -388,6 +405,39 @@
 			</div>
 		</div>
 
+		<hr class="dark:border-gray-850" />
+
+		<div class="">
+			<div class="text-sm font-medium">{$i18n.t('Text Extraction')}</div>
+
+			<div class="flex w-full justify-between mt-2">
+				<div class="self-center text-xs font-medium">{$i18n.t('Engine')}</div>
+				<div class="flex items-center relative">
+					<select
+							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							bind:value={textExtractionEngine}
+							on:change={(e) => {
+								showTikaServerUrl = (e.target.value === 'tika');
+							}}
+					>
+						<option value="default">{$i18n.t('Default')}</option>
+						<option value="tika">{$i18n.t('Tika')}</option>
+					</select>
+				</div>
+			</div>
+
+			{#if showTikaServerUrl}
+				<div class="flex w-full mt-2">
+					<div class="flex-1 mr-2">
+						<input
+								class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+								placeholder={$i18n.t('Enter Tika Server URL')}
+								bind:value={tikaServerUrl}
+						/>
+					</div>
+				</div>
+			{/if}
+		</div>
 		<hr class=" dark:border-gray-850 my-1" />
 
 		<div class="space-y-2" />