il y a 2 mois · ab1b910d80
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1579,6 +1579,18 @@ TIKA_SERVER_URL = PersistentConfig(
 
				     os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
			
 
				 )
			
 
				 
			
 
				+DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
			
 
				+    "DOCUMENT_INTELLIGENCE_ENDPOINT",
			
 
				+    "rag.document_intelligence_endpoint",
			
 
				+    os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""),
			
 
				+)
			
 
				+
			
 
				+DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
			
 
				+    "DOCUMENT_INTELLIGENCE_KEY",
			
 
				+    "rag.document_intelligence_key",
			
 
				+    os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
			
 
				+)
			
 
				+
			
 
				 RAG_TOP_K = PersistentConfig(
			
 
				     "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
			
 
				 )
			
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -180,6 +180,8 @@ from open_webui.config import (
 
				     CHUNK_SIZE,
			
 
				     CONTENT_EXTRACTION_ENGINE,
			
 
				     TIKA_SERVER_URL,
			
 
				+    DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				+    DOCUMENT_INTELLIGENCE_KEY,
			
 
				     RAG_TOP_K,
			
 
				     RAG_TEXT_SPLITTER,
			
 
				     TIKTOKEN_ENCODING_NAME,
			
@@ -533,6 +535,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
 
				 
			
 
				 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
			
 
				 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
			
 
				+app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
			
 
				+app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
			
 
				 
			
 
				 app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
			
 
				 app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
			
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -4,6 +4,7 @@ import ftfy
 
				 import sys
			
 
				 
			
 
				 from langchain_community.document_loaders import (
			
 
				+    AzureAIDocumentIntelligenceLoader,
			
 
				     BSHTMLLoader,
			
 
				     CSVLoader,
			
 
				     Docx2txtLoader,
			
@@ -147,6 +148,27 @@ class Loader:
 
				                     file_path=file_path,
			
 
				                     mime_type=file_content_type,
			
 
				                 )
			
 
				+        elif (
			
 
				+            self.engine == "document_intelligence"
			
 
				+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
			
 
				+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
			
 
				+            and (
			
 
				+                file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
			
 
				+                or file_content_type
			
 
				+                in [
			
 
				+                    "application/vnd.ms-excel",
			
 
				+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
			
 
				+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
			
 
				+                    "application/vnd.ms-powerpoint",
			
 
				+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
			
 
				+                ]
			
 
				+            )
			
 
				+        ):
			
 
				+            loader = AzureAIDocumentIntelligenceLoader(
			
 
				+                file_path=file_path,
			
 
				+                api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
			
 
				+                api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
			
 
				+            )
			
 
				         else:
			
 
				             if file_ext == "pdf":
			
 
				                 loader = PyPDFLoader(
			
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -356,6 +356,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
 
				         "content_extraction": {
			
 
				             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
			
 
				+            "document_intelligence_config": {
			
 
				+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
 
				+            },
			
 
				         },
			
 
				         "chunk": {
			
 
				             "text_splitter": request.app.state.config.TEXT_SPLITTER,
			
@@ -411,9 +415,15 @@ class FileConfig(BaseModel):
 
				     max_count: Optional[int] = None
			
 
				 
			
 
				 
			
 
				+class DocumentIntelligenceConfigForm(BaseModel):
			
 
				+    endpoint: str
			
 
				+    key: str
			
 
				+
			
 
				+
			
 
				 class ContentExtractionConfig(BaseModel):
			
 
				     engine: str = ""
			
 
				     tika_server_url: Optional[str] = None
			
 
				+    document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
			
 
				 
			
 
				 
			
 
				 class ChunkParamUpdateForm(BaseModel):
			
@@ -501,13 +511,22 @@ async def update_rag_config(
 
				         request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
			
 
				 
			
 
				     if form_data.content_extraction is not None:
			
 
				-        log.info(f"Updating text settings: {form_data.content_extraction}")
			
 
				+        log.info(
			
 
				+            f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
			
 
				+        )
			
 
				         request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
			
 
				             form_data.content_extraction.engine
			
 
				         )
			
 
				         request.app.state.config.TIKA_SERVER_URL = (
			
 
				             form_data.content_extraction.tika_server_url
			
 
				         )
			
 
				+        if form_data.content_extraction.document_intelligence_config is not None:
			
 
				+            request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
			
 
				+                form_data.content_extraction.document_intelligence_config.endpoint
			
 
				+            )
			
 
				+            request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
			
 
				+                form_data.content_extraction.document_intelligence_config.key
			
 
				+            )
			
 
				 
			
 
				     if form_data.chunk is not None:
			
 
				         request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
			
@@ -604,6 +623,10 @@ async def update_rag_config(
 
				         "content_extraction": {
			
 
				             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
			
 
				+            "document_intelligence_config": {
			
 
				+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
 
				+            },
			
 
				         },
			
 
				         "chunk": {
			
 
				             "text_splitter": request.app.state.config.TEXT_SPLITTER,
			
@@ -937,6 +960,8 @@ def process_file(
 
				                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
			
 
				                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
			
 
				+                    DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				+                    DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
 
				                 )
			
 
				                 docs = loader.load(
			
 
				                     file.filename, file.meta.get("content_type"), file_path
			
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -71,6 +71,7 @@ validators==0.34.0
 
				 psutil
			
 
				 sentencepiece
			
 
				 soundfile==0.13.1
			
 
				+azure-ai-documentintelligence==1.0.0
			
 
				 
			
 
				 opencv-python-headless==4.11.0.86
			
 
				 rapidocr-onnxruntime==1.3.24
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,7 @@ dependencies = [
 
				     "psutil",
			
 
				     "sentencepiece",
			
 
				     "soundfile==0.13.1",
			
 
				+    "azure-ai-documentintelligence==1.0.0",
			
 
				 
			
 
				     "opencv-python-headless==4.11.0.86",
			
 
				     "rapidocr-onnxruntime==1.3.24",
			
--- a/src/lib/apis/retrieval/index.ts
+++ b/src/lib/apis/retrieval/index.ts
@@ -32,9 +32,15 @@ type ChunkConfigForm = {
 
				 	chunk_overlap: number;
			
 
				 };
			
 
				 
			
 
				+type DocumentIntelligenceConfigForm = {
			
 
				+	key: string;
			
 
				+	endpoint: string;
			
 
				+};
			
 
				+
			
 
				 type ContentExtractConfigForm = {
			
 
				 	engine: string;
			
 
				 	tika_server_url: string | null;
			
 
				+	document_intelligence_config: DocumentIntelligenceConfigForm | null;
			
 
				 };
			
 
				 
			
 
				 type YoutubeConfigForm = {
			
--- a/src/lib/components/admin/Settings/Documents.svelte
+++ b/src/lib/components/admin/Settings/Documents.svelte
@@ -49,6 +49,9 @@
 
				 	let contentExtractionEngine = 'default';
			
 
				 	let tikaServerUrl = '';
			
 
				 	let showTikaServerUrl = false;
			
 
				+	let documentIntelligenceEndpoint = '';
			
 
				+	let documentIntelligenceKey = '';
			
 
				+	let showDocumentIntelligenceConfig = false;
			
 
				 
			
 
				 	let textSplitter = '';
			
 
				 	let chunkSize = 0;
			
@@ -176,6 +179,13 @@
 
				 			toast.error($i18n.t('Tika Server URL required.'));
			
 
				 			return;
			
 
				 		}
			
 
				+		if (
			
 
				+			contentExtractionEngine === 'document_intelligence' &&
			
 
				+			(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
			
 
				+		) {
			
 
				+			toast.error($i18n.t('Document Intelligence endpoint and key required.'));
			
 
				+			return;
			
 
				+		}
			
 
				 		const res = await updateRAGConfig(localStorage.token, {
			
 
				 			pdf_extract_images: pdfExtractImages,
			
 
				 			enable_google_drive_integration: enableGoogleDriveIntegration,
			
@@ -191,7 +201,11 @@
 
				 			},
			
 
				 			content_extraction: {
			
 
				 				engine: contentExtractionEngine,
			
 
				-				tika_server_url: tikaServerUrl
			
 
				+				tika_server_url: tikaServerUrl,
			
 
				+				document_intelligence_config: {
			
 
				+					key: documentIntelligenceKey,
			
 
				+					endpoint: documentIntelligenceEndpoint
			
 
				+				}
			
 
				 			}
			
 
				 		});
			
 
				 
			
@@ -249,6 +263,9 @@
 
				 			contentExtractionEngine = res.content_extraction.engine;
			
 
				 			tikaServerUrl = res.content_extraction.tika_server_url;
			
 
				 			showTikaServerUrl = contentExtractionEngine === 'tika';
			
 
				+			documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
			
 
				+			documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
			
 
				+			showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
			
 
				 
			
 
				 			fileMaxSize = res?.file.max_size ?? '';
			
 
				 			fileMaxCount = res?.file.max_count ?? '';
			
@@ -585,10 +602,12 @@
 
				 						bind:value={contentExtractionEngine}
			
 
				 						on:change={(e) => {
			
 
				 							showTikaServerUrl = e.target.value === 'tika';
			
 
				+							showDocumentIntelligenceConfig = e.target.value === 'document_intelligence';
			
 
				 						}}
			
 
				 					>
			
 
				 						<option value="">{$i18n.t('Default')} </option>
			
 
				 						<option value="tika">{$i18n.t('Tika')}</option>
			
 
				+						<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
			
 
				 					</select>
			
 
				 				</div>
			
 
				 			</div>
			
@@ -604,6 +623,21 @@
 
				 					</div>
			
 
				 				</div>
			
 
				 			{/if}
			
 
				+
			
 
				+			{#if showDocumentIntelligenceConfig}
			
 
				+				<div class="my-0.5 flex gap-2 pr-2">
			
 
				+					<input
			
 
				+						class="flex-1 w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
			
 
				+						placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
			
 
				+						bind:value={documentIntelligenceEndpoint}
			
 
				+					/>
			
 
				+
			
 
				+					<SensitiveInput
			
 
				+						placeholder={$i18n.t('Enter Document Intelligence Key')}
			
 
				+						bind:value={documentIntelligenceKey}
			
 
				+					/>
			
 
				+				</div>
			
 
				+			{/if}
			
 
				 		</div>
			
 
				 
			
 
				 		<hr class=" border-gray-100 dark:border-gray-850" />