há 2 meses atrás · f8ac44cfbd
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig(
 
				     os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
			
 
				 )
			
 
				 
			
 
				+DOCLING_SERVER_URL = PersistentConfig(
			
 
				+    "DOCLING_SERVER_URL",
			
 
				+    "rag.docling_server_url",
			
 
				+    os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
			
 
				+)
			
 
				+
			
 
				 DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
			
 
				     "DOCUMENT_INTELLIGENCE_ENDPOINT",
			
 
				     "rag.document_intelligence_endpoint",
			
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -186,6 +186,7 @@ from open_webui.config import (
 
				     CHUNK_SIZE,
			
 
				     CONTENT_EXTRACTION_ENGINE,
			
 
				     TIKA_SERVER_URL,
			
 
				+    DOCLING_SERVER_URL,
			
 
				     DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				     DOCUMENT_INTELLIGENCE_KEY,
			
 
				     RAG_TOP_K,
			
@@ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
 
				 
			
 
				 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
			
 
				 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
			
 
				+app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
			
 
				 app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
			
 
				 app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
			
 
				 
			
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -117,6 +117,52 @@ class TikaLoader:
 
				             raise Exception(f"Error calling Tika: {r.reason}")
			
 
				 
			
 
				 
			
 
				+class DoclingLoader:
			
 
				+    def __init__(self, url, file_path=None, mime_type=None):
			
 
				+        self.url = url.rstrip("/")
			
 
				+        self.file_path = file_path
			
 
				+        self.mime_type = mime_type
			
 
				+
			
 
				+    def load(self) -> list[Document]:
			
 
				+        with open(self.file_path, "rb") as f:
			
 
				+            files = {
			
 
				+                "files": (
			
 
				+                    self.file_path,
			
 
				+                    f,
			
 
				+                    self.mime_type or "application/octet-stream",
			
 
				+                )
			
 
				+            }
			
 
				+
			
 
				+            params = {
			
 
				+                "image_export_mode": "placeholder",
			
 
				+                "table_mode": "accurate",
			
 
				+            }
			
 
				+
			
 
				+            endpoint = f"{self.url}/v1alpha/convert/file"
			
 
				+            r = requests.post(endpoint, files=files, data=params)
			
 
				+
			
 
				+        if r.ok:
			
 
				+            result = r.json()
			
 
				+            document_data = result.get("document", {})
			
 
				+            text = document_data.get("md_content", "<No text content found>")
			
 
				+
			
 
				+            metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
			
 
				+
			
 
				+            log.debug("Docling extracted text: %s", text)
			
 
				+
			
 
				+            return [Document(page_content=text, metadata=metadata)]
			
 
				+        else:
			
 
				+            error_msg = f"Error calling Docling API: {r.reason}"
			
 
				+            if r.text:
			
 
				+                try:
			
 
				+                    error_data = r.json()
			
 
				+                    if "detail" in error_data:
			
 
				+                        error_msg += f" - {error_data['detail']}"
			
 
				+                except Exception:
			
 
				+                    error_msg += f" - {r.text}"
			
 
				+            raise Exception(f"Error calling Docling: {error_msg}")
			
 
				+
			
 
				+
			
 
				 class Loader:
			
 
				     def __init__(self, engine: str = "", **kwargs):
			
 
				         self.engine = engine
			
@@ -149,6 +195,12 @@ class Loader:
 
				                     file_path=file_path,
			
 
				                     mime_type=file_content_type,
			
 
				                 )
			
 
				+        elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
			
 
				+            loader = DoclingLoader(
			
 
				+                url=self.kwargs.get("DOCLING_SERVER_URL"),
			
 
				+                file_path=file_path,
			
 
				+                mime_type=file_content_type,
			
 
				+            )
			
 
				         elif (
			
 
				             self.engine == "document_intelligence"
			
 
				             and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
			
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
 
				         "content_extraction": {
			
 
				             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
			
 
				+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
			
 
				             "document_intelligence_config": {
			
 
				                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
@@ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel):
 
				 class ContentExtractionConfig(BaseModel):
			
 
				     engine: str = ""
			
 
				     tika_server_url: Optional[str] = None
			
 
				+    docling_server_url: Optional[str] = None
			
 
				     document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
			
 
				 
			
 
				 
			
@@ -540,6 +542,9 @@ async def update_rag_config(
 
				         request.app.state.config.TIKA_SERVER_URL = (
			
 
				             form_data.content_extraction.tika_server_url
			
 
				         )
			
 
				+        request.app.state.config.DOCLING_SERVER_URL = (
			
 
				+            form_data.content_extraction.docling_server_url
			
 
				+        )
			
 
				         if form_data.content_extraction.document_intelligence_config is not None:
			
 
				             request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
			
 
				                 form_data.content_extraction.document_intelligence_config.endpoint
			
@@ -648,6 +653,7 @@ async def update_rag_config(
 
				         "content_extraction": {
			
 
				             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
			
 
				+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
			
 
				             "document_intelligence_config": {
			
 
				                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
@@ -990,6 +996,7 @@ def process_file(
 
				                 loader = Loader(
			
 
				                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
			
 
				                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
			
 
				+                    DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
			
 
				                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
			
 
				                     DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
			
 
				                     DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
			
--- a/src/lib/components/admin/Settings/Documents.svelte
+++ b/src/lib/components/admin/Settings/Documents.svelte
@@ -49,6 +49,8 @@
 
				 	let contentExtractionEngine = 'default';
			
 
				 	let tikaServerUrl = '';
			
 
				 	let showTikaServerUrl = false;
			
 
				+	let doclingServerUrl = '';
			
 
				+	let showDoclingServerUrl = false;
			
 
				 	let documentIntelligenceEndpoint = '';
			
 
				 	let documentIntelligenceKey = '';
			
 
				 	let showDocumentIntelligenceConfig = false;
			
@@ -175,6 +177,10 @@
 
				 			toast.error($i18n.t('Tika Server URL required.'));
			
 
				 			return;
			
 
				 		}
			
 
				+		if (contentExtractionEngine === 'docling' && doclingServerUrl === '') {
			
 
				+			toast.error($i18n.t('Docling Server URL required.'));
			
 
				+			return;
			
 
				+		}
			
 
				 		if (
			
 
				 			contentExtractionEngine === 'document_intelligence' &&
			
 
				 			(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
			
@@ -209,6 +215,7 @@
 
				 			content_extraction: {
			
 
				 				engine: contentExtractionEngine,
			
 
				 				tika_server_url: tikaServerUrl,
			
 
				+				docling_server_url: doclingServerUrl,
			
 
				 				document_intelligence_config: {
			
 
				 					key: documentIntelligenceKey,
			
 
				 					endpoint: documentIntelligenceEndpoint
			
@@ -269,7 +276,10 @@
 
				 
			
 
				 			contentExtractionEngine = res.content_extraction.engine;
			
 
				 			tikaServerUrl = res.content_extraction.tika_server_url;
			
 
				+			doclingServerUrl = res.content_extraction.docling_server_url;
			
 
				+
			
 
				 			showTikaServerUrl = contentExtractionEngine === 'tika';
			
 
				+			showDoclingServerUrl = contentExtractionEngine === 'docling';
			
 
				 			documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
			
 
				 			documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
			
 
				 			showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
			
@@ -337,6 +347,7 @@
 
				 							>
			
 
				 								<option value="">{$i18n.t('Default')} </option>
			
 
				 								<option value="tika">{$i18n.t('Tika')}</option>
			
 
				+								<option value="docling">{ $i18n.t('Docling') }</option>
			
 
				 								<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
			
 
				 							</select>
			
 
				 						</div>
			
@@ -351,6 +362,14 @@
 
				 								/>
			
 
				 							</div>
			
 
				 						</div>
			
 
				+					{:else if contentExtractionEngine === 'docling'}
			
 
				+						<div class="flex w-full mt-1">
			
 
				+							<input
			
 
				+								class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
			
 
				+								placeholder={$i18n.t('Enter Docling Server URL')}
			
 
				+								bind:value={doclingServerUrl}
			
 
				+							/>
			
 
				+						</div>
			
 
				 					{:else if contentExtractionEngine === 'document_intelligence'}
			
 
				 						<div class="my-0.5 flex gap-2 pr-2">
			
 
				 							<input